Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -891,8 +891,7 @@ jobs:
PYTHONPATH: "${{ env.BUILD_DIR }}:"
run: |
source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
python3 -m pytest -v ${{ env.SRC_DIR }}/tests/python_tests/test_llm_pipeline.py
# python3 -m pytest -v ${{ env.SRC_DIR }}/tests/python_tests/test_structured_output.py
python3 -m pytest -v ${{ env.SRC_DIR }}/tests/python_tests/test_llm_pipeline.py -k "not test_perf_metrics_with_structured_output"

Overall_Status:
name: ci/gha_overall_status_linux
Expand Down
24 changes: 24 additions & 0 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "openvino/genai/visibility.hpp"
#include <vector>
#include <memory>
#include <map>
#include <string>
#include <optional>

namespace ov {
Expand All @@ -27,6 +29,7 @@ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;
* @param m_batch_sizes Batch sizes for each generate call.
* @param m_durations Total durations for each generate call in microseconds.
* @param m_inference_durations Total inference duration for each generate call in microseconds.
* @param m_grammar_compile_times Time to compile the grammar in microseconds.
*/
struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
std::vector<MicroSeconds> generate_durations;
Expand All @@ -39,6 +42,8 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
std::vector<size_t> m_batch_sizes;
std::vector<MicroSeconds> m_durations;
std::vector<MicroSeconds> m_inference_durations;

std::vector<MicroSeconds> m_grammar_compile_times;
};

/**
Expand All @@ -49,6 +54,16 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair {
float std;
};

/**
* @brief Structure to store list of durations in milliseconds.
*/
struct OPENVINO_GENAI_EXPORTS SummaryStats {
float mean;
float std;
float min;
float max;
};

/**
* @brief Holds performance metrics for each generate call.
*
Expand Down Expand Up @@ -76,6 +91,8 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair {
* @param get_generate_duration Returns the mean and standard deviation of generate duration.
* @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration.
* @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration.
* @param get_grammar_compiler_init_times Returns a map with the time to initialize the grammar compiler for each backend in milliseconds.
* @param get_grammar_compile_time Returns the time to compile the grammar in milliseconds.
* @param get_microsec Converts a duration to microseconds.
* @param m_evaluated Flag indicating if raw metrics were evaluated.
* If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them.
Expand Down Expand Up @@ -103,6 +120,10 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
MeanStdPair ipot; // Inference time (in ms) per output token.
MeanStdPair throughput; // Tokens per second.

// Time to initialize grammar compiler for each backend in ms.
std::map<std::string, float> grammar_compiler_init_times;
SummaryStats grammar_compile_time; // Time to compile grammar in ms.

MeanStdPair generate_duration;
MeanStdPair inference_duration;
MeanStdPair tokenization_duration = {-1.0f, -1.0f};
Expand All @@ -118,6 +139,9 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT).
MeanStdPair get_ipot(); // Inference time (in ms) per output token.
MeanStdPair get_throughput(); // Tokens per second.

std::map<std::string, float> get_grammar_compiler_init_times();
SummaryStats get_grammar_compile_time(); // in ms

MeanStdPair get_inference_duration(); // in ms
MeanStdPair get_generate_duration(); // in ms
Expand Down
9 changes: 8 additions & 1 deletion src/cpp/src/continuous_batching/pipeline_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,11 +421,12 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
GenerationHandle& generation = generations.at(0);

streamer_ptr->start();

m_sampler->clear_structured_output_compile_times();
while (has_non_finished_requests()) {
try {
const auto infer_start = std::chrono::steady_clock::now();
step();

// During prefill step (or steps if max_batch_size < prompt_len) we don't generate new tokens,
// but still inference took place, so we need to add this time to the total inference duration.
raw_perf_counters.m_inference_durations[0] += MicroSeconds(m_pipeline_metrics.inference_duration);
Expand All @@ -444,6 +445,12 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
stream_tokens(streamer_ptr, generation);
}

auto times = m_sampler->get_structured_output_times();
perf_metrics.grammar_compiler_init_times = times.first;
for (const auto& t: times.second) {
raw_perf_counters.m_grammar_compile_times.emplace_back(t);
}

// waiting for competion of streaming
streamer_ptr->end();

Expand Down
3 changes: 3 additions & 0 deletions src/cpp/src/generation_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) {
read_anymap_param(properties, "assistant_confidence_threshold", assistant_confidence_threshold);
read_anymap_param(properties, "num_assistant_tokens", num_assistant_tokens);
read_anymap_param(properties, "max_ngram_size", max_ngram_size);

// Structured output
read_anymap_param(properties, "structured_output_config", structured_output_config);
}

StructuredOutputConfig::StructuredOutputConfig(const ov::AnyMap& properties) {
Expand Down
40 changes: 39 additions & 1 deletion src/cpp/src/perf_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSecon
return {mean, std};
}

ov::genai::SummaryStats calc_full_stat(const std::vector<ov::genai::MicroSeconds>& durations) {
if (durations.size() == 0) {
return {-1, -1, -1, -1};
}
auto minmax = std::minmax_element(durations.begin(), durations.end());
auto meanstd = calc_mean_and_std(durations);
return {meanstd.mean, meanstd.std, minmax.first->count() / 1000.0f, minmax.second->count() / 1000.0f};
}

float PerfMetrics::get_load_time() {
return load_time;
}
Expand Down Expand Up @@ -84,6 +93,16 @@ MeanStdPair PerfMetrics::get_inference_duration() {
return inference_duration;
}

std::map<std::string, float> PerfMetrics::get_grammar_compiler_init_times() {
return grammar_compiler_init_times;
}

SummaryStats PerfMetrics::get_grammar_compile_time() {
evaluate_statistics();
return grammar_compile_time;
}


float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) {
return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
}
Expand Down Expand Up @@ -124,6 +143,8 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations);
ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token);

grammar_compile_time = calc_full_stat(raw_metrics.m_grammar_compile_times);

generate_duration = calc_mean_and_std(raw_metrics.generate_durations);
tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations);
detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);
Expand All @@ -136,9 +157,22 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {

PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const {
OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline");

// Copy left value to res.
PerfMetrics res = *this;

// maps with grammar compiler init times should not have conflicting keys
// {{"xgrammar", 10}} + {{"llmguidance", 20}} = {{"grammar", 10}, {"llmguidance", 20}} - is OK!
// {{"xgrammar", 10}} + {{"xgrammar", 10}} = {{"xgrammar", 10}} - is OK!
// {{"xgrammar", 10}} + {{"xgrammar", 20}} = is NOT OK! Fails on assert!
for (const auto& [key, value] : right.grammar_compiler_init_times) {
auto it = res.grammar_compiler_init_times.find(key);
if (it != res.grammar_compiler_init_times.end()) {
OPENVINO_ASSERT(it->second == value, "Grammar compiler init time for the same backend should be the same. ",
"You are trying to accumulate metrics for different pipelines which is not allowed.");
}
res.grammar_compiler_init_times[key] = value;
}

// Concatenate durations, batch_sizes first token times.
auto& new_durations = res.raw_metrics.m_durations;
Expand Down Expand Up @@ -170,6 +204,10 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const {
new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end());
new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end());

// Concatenate structured output compilation times.
auto& new_grammar_compile_times = res.raw_metrics.m_grammar_compile_times;
new_grammar_compile_times.insert(new_grammar_compile_times.end(), right.raw_metrics.m_grammar_compile_times.begin(), right.raw_metrics.m_grammar_compile_times.end());

res.num_generated_tokens += right.num_generated_tokens;
res.num_input_tokens += right.num_input_tokens;
res.m_evaluated = false;
Expand Down
15 changes: 15 additions & 0 deletions src/cpp/src/sampling/sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,21 @@ std::map<size_t, int32_t> Sampler::GroupBeamSearcher::get_beam_idxs() {
return next_beams;
}

std::pair<std::map<std::string, float>, std::vector<float>> Sampler::get_structured_output_times() {
if (m_structured_output_controller) {
return m_structured_output_controller->get_times();
} else {
// If compiled without structured output support, return empty times
return {{}, {}};
}
}

void Sampler::clear_structured_output_compile_times() {
if (m_structured_output_controller) {
m_structured_output_controller->clear_compile_times();
}
}

void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits,
SamplerOutput& sampler_output,
const std::pair<size_t, std::set<std::string>>& stop_strings) {
Expand Down
3 changes: 3 additions & 0 deletions src/cpp/src/sampling/sampler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ class Sampler {
void create_logit_processor(uint64_t request_id, const GenerationConfig& sampling_parameters, const TokenIds& prompt);

std::map<size_t, int32_t> get_beam_idxs(SequenceGroup::CPtr sequence_group);
// pair with map with backend name and corresponding compiler init time, and vector of compile times for each concrete grammar
std::pair<std::map<std::string, float>, std::vector<float>> get_structured_output_times();
void clear_structured_output_compile_times();
};

class Sampler::GroupBeamSearcher {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ StructuredOutputController::get_logits_transformer(const ov::genai::GenerationCo
}
std::string backend_name = (*guided_gen_config).backend.value_or(get_default_backend_name());

std::unique_lock<std::mutex> lock(m_mutex);

// Check if backend already instantiated
auto impl_it = m_impls.find(backend_name);
if (impl_it == m_impls.end()) {
Expand All @@ -52,12 +54,28 @@ StructuredOutputController::get_logits_transformer(const ov::genai::GenerationCo
}

// Create the backend instance and store it
const auto start = std::chrono::steady_clock::now();
m_impls[backend_name] = factory_it->second(m_tokenizer, m_vocab_size);
impl_it = m_impls.find(backend_name);
const auto end = std::chrono::steady_clock::now();
m_init_grammar_compiler_times[backend_name] = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
}

// Use the instantiated backend
return impl_it->second->get_logits_transformer(sampling_parameters);
const auto start = std::chrono::steady_clock::now();
auto res = impl_it->second->get_logits_transformer(sampling_parameters);
const auto end = std::chrono::steady_clock::now();
m_grammar_compile_times.emplace_back(std::chrono::duration_cast<std::chrono::microseconds>(end - start).count());
return res;
}

std::pair<std::map<std::string, float>, std::vector<float>> StructuredOutputController::get_times() const {
return {m_init_grammar_compiler_times, m_grammar_compile_times};
}

void StructuredOutputController::clear_compile_times() {
std::lock_guard<std::mutex> lock(m_mutex);
m_grammar_compile_times.clear();
}

} // namespace genai
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,16 @@ class StructuredOutputController {
static void set_default_backend(const std::string& name);
static std::string& get_default_backend_name();
static std::unordered_map<std::string, BackendFactory>& get_backend_registry();


std::pair<std::map<std::string, float>, std::vector<float>> get_times() const;
void clear_compile_times();
private:
std::map<std::string, float> m_init_grammar_compiler_times;
std::vector<float> m_grammar_compile_times;
std::unordered_map<std::string, std::unique_ptr<IStructuredOutputImpl>> m_impls;
const ov::genai::Tokenizer& m_tokenizer;
std::optional<int> m_vocab_size;
std::mutex m_mutex;
};

} // namespace genai
Expand Down
41 changes: 40 additions & 1 deletion src/python/openvino_genai/py_openvino_genai.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ from __future__ import annotations
import openvino._pyopenvino
import os
import typing
__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'ExtendedPerfMetrics', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'ImageGenerationPerfMetrics', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawImageGenerationPerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'SDPerModelsPerfMetrics', 'SDPerfMetrics', 'Scheduler', 'SchedulerConfig', 'SpeechGenerationConfig', 'SpeechGenerationPerfMetrics', 'StopCriteria', 'StreamerBase', 'StreamingStatus', 'StructuredOutputConfig', 'T5EncoderModel', 'Text2ImagePipeline', 'Text2SpeechDecodedResults', 'Text2SpeechPipeline', 'TextEmbeddingPipeline', 'TextStreamer', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version']
__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'ExtendedPerfMetrics', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'ImageGenerationPerfMetrics', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawImageGenerationPerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'SDPerModelsPerfMetrics', 'SDPerfMetrics', 'Scheduler', 'SchedulerConfig', 'SpeechGenerationConfig', 'SpeechGenerationPerfMetrics', 'StopCriteria', 'StreamerBase', 'StreamingStatus', 'StructuredOutputConfig', 'SummaryStats', 'T5EncoderModel', 'Text2ImagePipeline', 'Text2SpeechDecodedResults', 'Text2SpeechPipeline', 'TextEmbeddingPipeline', 'TextStreamer', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version']
class Adapter:
"""
Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
Expand Down Expand Up @@ -546,6 +546,12 @@ class ExtendedPerfMetrics:
:param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds.
:type get_detokenization_duration: MeanStdPair

:param get_grammar_compiler_init_times: Returns a map with the time to initialize the grammar compiler for each backend in milliseconds.
:type get_grammar_compiler_init_times: dict[str, float]

:param get_grammar_compile_time: Returns the mean, standard deviation, min, and max of grammar compile times in milliseconds.
:type get_grammar_compile_time: SummaryStats

:param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics.
:type raw_metrics: RawPerfMetrics
"""
Expand Down Expand Up @@ -1428,6 +1434,12 @@ class PerfMetrics:
:param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds.
:type get_detokenization_duration: MeanStdPair

:param get_grammar_compiler_init_times: Returns a map with the time to initialize the grammar compiler for each backend in milliseconds.
:type get_grammar_compiler_init_times: dict[str, float]

:param get_grammar_compile_time: Returns the mean, standard deviation, min, and max of grammar compile times in milliseconds.
:type get_grammar_compile_time: SummaryStats

:param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics.
:type raw_metrics: RawPerfMetrics
"""
Expand All @@ -1441,6 +1453,10 @@ class PerfMetrics:
...
def get_generate_duration(self) -> MeanStdPair:
...
def get_grammar_compile_time(self) -> SummaryStats:
...
def get_grammar_compiler_init_times(self) -> dict[str, float]:
...
def get_inference_duration(self) -> MeanStdPair:
...
def get_ipot(self) -> MeanStdPair:
Expand Down Expand Up @@ -1557,6 +1573,9 @@ class RawPerfMetrics:

:param inference_durations : Total inference duration for each generate call in milliseconds.
:type batch_sizes: list[float]

:param grammar_compile_times: Time to compile the grammar in milliseconds.
:type grammar_compile_times: list[float]
"""
def __init__(self) -> None:
...
Expand All @@ -1567,6 +1586,9 @@ class RawPerfMetrics:
def generate_durations(self) -> list[float]:
...
@property
def grammar_compile_times(self) -> list[float]:
...
@property
def inference_durations(self) -> list[float]:
...
@property
Expand Down Expand Up @@ -1987,6 +2009,23 @@ class StructuredOutputConfig:
@regex.setter
def regex(self, arg0: str | None) -> None:
...
class SummaryStats:
def __init__(self) -> None:
...
def as_tuple(self) -> tuple:
...
@property
def max(self) -> float:
...
@property
def mean(self) -> float:
...
@property
def min(self) -> float:
...
@property
def std(self) -> float:
...
class T5EncoderModel:
"""
T5EncoderModel class.
Expand Down
Loading