diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 4fca4037301fb..71292f5c72ecb 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -21,19 +21,18 @@ namespace onnxruntime { namespace openvino_ep { -GlobalContext& BackendManager::GetGlobalContext() { +GlobalContext* BackendManager::GetGlobalContext() { return global_context_; } -BackendManager::BackendManager(const GlobalContext& global_context, +BackendManager::BackendManager(GlobalContext* global_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, EPCtxHandler& ep_ctx_handle_) { global_context_ = global_context; - - openvino_sdk_version_ = std::to_string(global_context_.OpenVINO_Version.at(0)) + "." + - std::to_string(global_context_.OpenVINO_Version.at(1)); + openvino_sdk_version_ = std::to_string(global_context_->OpenVINO_Version.at(0)) + "." + + std::to_string(global_context_->OpenVINO_Version.at(1)); if (ep_ctx_handle_.CheckForOVEPCtxNode(subgraph, openvino_sdk_version_)) { if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph) != Status::OK()) ORT_THROW("Import blob from model failed"); @@ -66,17 +65,17 @@ BackendManager::BackendManager(const GlobalContext& global_context, } subgraph_context_.subgraph_name = fused_node.Name(); auto model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); - std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type; + std::string device_type = openvino_ep::BackendManager::GetGlobalContext()->device_type; if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; - ORT_ENFORCE(!global_context_.enable_qdq_optimizer, + ORT_ENFORCE(!global_context_->enable_qdq_optimizer, "QDQ stripping should not be enabled for models with dynamic input shapes. " "Set enable_qdq_optimizer to False"); - if ((GetGlobalContext().device_type.find("CPU") != std::string::npos || - GetGlobalContext().device_type.find("GPU") != std::string::npos) && - !GetGlobalContext().disable_dynamic_shapes) { + if ((GetGlobalContext()->device_type.find("CPU") != std::string::npos || + GetGlobalContext()->device_type.find("GPU") != std::string::npos) && + !GetGlobalContext()->disable_dynamic_shapes) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. " << "Creating backend Dynamic Shapes"; try { @@ -110,7 +109,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && - !GetGlobalContext().disable_cpu_fallback && + !GetGlobalContext()->disable_cpu_fallback && !ep_ctx_handle_.IsValidOVEPCtxGraph(); #if defined(OPENVINO_DISABLE_NPU_FALLBACK) eligible_for_cpu_fallback = false; @@ -119,8 +118,8 @@ BackendManager::BackendManager(const GlobalContext& global_context, LOGS_DEFAULT(VERBOSE) << exception_str; LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; + GetGlobalContext()->device_type = "CPU"; + GetGlobalContext()->precision_str = "FP32"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, GetGlobalContext(), @@ -157,7 +156,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, } } } - if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { + if (global_context_->export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, logger); if ((!status.IsOK())) { @@ -172,7 +171,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, // the EPContext node. Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, const logging::Logger& logger) { - if (GetGlobalContext().disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { + if (GetGlobalContext()->disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { std::string exception_str = "Exporting dynamically compiled models at runtime is not supported. " "Cannot export blobs of dynamic models that request static shape inference. " @@ -184,19 +183,19 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie auto compiled_model = concrete_backend_->GetOVCompiledModel(); std::string graph_name = ""; // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!global_context_.cache_dir.empty()) { - graph_name = global_context_.cache_dir; + if (!global_context_->cache_dir.empty()) { + graph_name = global_context_->cache_dir; } else { - graph_name = global_context_.onnx_model_path_name; + graph_name = global_context_->onnx_model_path_name; // Remove extension so we can append suffix to form the complete name of output graph - size_t dot = global_context_.onnx_model_path_name.find_last_of("."); + size_t dot = global_context_->onnx_model_path_name.find_last_of("."); graph_name = graph_name.substr(0, dot); if (dot != std::string::npos) graph_name += "_ctx.onnx"; } // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob - if (global_context_.ep_context_embed_mode) { + if (global_context_->ep_context_embed_mode) { std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); model_blob_str = std::move(model_blob_stream).str(); @@ -218,7 +217,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer, graph_name, logger, - global_context_.ep_context_embed_mode, + global_context_->ep_context_embed_mode, std::move(model_blob_str), openvino_sdk_version_)); @@ -337,8 +336,8 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, }; // QDQ stripping enabled only for the NPU - if (global_context_.device_type.find("NPU") != std::string::npos && - global_context_.enable_qdq_optimizer && + if (global_context_->device_type.find("NPU") != std::string::npos && + global_context_->enable_qdq_optimizer && IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; @@ -346,7 +345,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, auto model_proto = model->ToProto(); 
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(global_context_->onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else { @@ -356,7 +355,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); subgraph.ToProto(*model_proto->mutable_graph(), true, true); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(global_context_->onnx_model_path_name, model_proto.get(), fused_node); return model_proto; } } @@ -448,13 +447,13 @@ void BackendManager::Compute(OrtKernelContext* context) { // by rewriting the model to static shaped model at runtime based on input shape. // disable_dynamic_shapes is always set to true for OV NPU plugin. if (subgraph_context_.has_dynamic_input_shape && - !GetGlobalContext().disable_dynamic_shapes && - (GetGlobalContext().device_type.find("CPU") != std::string::npos || - GetGlobalContext().device_type.find("GPU") != std::string::npos)) { + !GetGlobalContext()->disable_dynamic_shapes && + (GetGlobalContext()->device_type.find("CPU") != std::string::npos || + GetGlobalContext()->device_type.find("GPU") != std::string::npos)) { concrete_backend_->Infer(context); } else if (subgraph_context_.has_dynamic_input_shape) { std::vector> tensor_shapes = GetInputTensorShapes(ctx); - auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext()->device_type); std::shared_ptr dynamic_backend; auto search = backend_map_.find(key); if (search == backend_map_.end()) { @@ -474,14 +473,14 @@ void BackendManager::Compute(OrtKernelContext* context) { LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."; ORT_THROW(ex.what()); #else - if (GetGlobalContext().device_type.find("NPU") != std::string::npos && - !GetGlobalContext().disable_cpu_fallback) { + if (GetGlobalContext()->device_type.find("NPU") != std::string::npos && + !GetGlobalContext()->disable_cpu_fallback) { LOGS_DEFAULT(WARNING) << ex.what(); LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; - key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + GetGlobalContext()->device_type = "CPU"; + GetGlobalContext()->precision_str = "FP32"; + key = MakeMapKeyString(tensor_shapes, GetGlobalContext()->device_type); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, GetGlobalContext(), diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index b9ff7a72372b3..578c1c199f832 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -19,7 +19,7 @@ namespace openvino_ep { // Singleton class that manages all the backends class BackendManager { public: - BackendManager(const GlobalContext& global_context, + BackendManager(GlobalContext* global_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, @@ -27,7 +27,7 @@ class BackendManager { void Compute(OrtKernelContext* context); void ShutdownBackendManager(); void SetGlobalCotext(const GlobalContext& global_context); - GlobalContext& GetGlobalContext(); + GlobalContext* GetGlobalContext(); Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger); @@ -51,7 +51,7 @@ class BackendManager { std::shared_ptr concrete_backend_; std::map> backend_map_; SubGraphContext subgraph_context_; - GlobalContext global_context_; + GlobalContext* global_context_; EPCtxHandler ep_ctx_handle_{}; std::string openvino_sdk_version_{}; }; diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 4d9fbe09f118d..51828938824c4 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -41,17 +41,17 @@ struct static_cast_int64 { }; std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, +CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext* global_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; } const std::string model = model_proto.SerializeAsString(); try { - auto cnn_network = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); + auto cnn_network = global_context->ie_core.ReadModel(model, global_context->onnx_model_path_name); // Check for Constant Folding - if (!global_context.is_wholly_supported_graph) { + if (!global_context->is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(cnn_network); auto& results = const_cast(cnn_network.get()->get_results()); @@ -130,13 +130,13 @@ GetOutputTensor(Ort::KernelContext& context, return context.GetOutput(index, output_shape.get(), num_dims); } -int GetFirstAvailableDevice(GlobalContext& global_context) { +int GetFirstAvailableDevice(GlobalContext* global_context) { int i = 0; // Get the first available VAD-M device and set the device to busy while (i < 8) { - bool device = global_context.deviceAvailableList[i]; + bool device = global_context->deviceAvailableList[i]; if (device) { - global_context.deviceAvailableList[i] = false; + global_context->deviceAvailableList[i] = false; break; } i++; @@ -145,9 +145,9 @@ int 
GetFirstAvailableDevice(GlobalContext& global_context) { // make all remaining devices free if (i == 8) { i = 0; - global_context.deviceAvailableList[i] = false; + global_context->deviceAvailableList[i] = false; for (int j = 1; j < 8; j++) { - global_context.deviceAvailableList[j] = true; + global_context->deviceAvailableList[j] = true; } } return i; diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index a105e6b08aade..915fb75a5eb5d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -34,7 +34,7 @@ bool IsDebugEnabled(); // Internal diagnostic function. bool IsCILogEnabled(); -int GetFirstAvailableDevice(GlobalContext& global_context); +int GetFirstAvailableDevice(GlobalContext* global_context); void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedValue& out_tensor); @@ -62,7 +62,7 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, - const GlobalContext& global_context, + const GlobalContext* global_context, std::map>& const_outputs_map); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index b7e4aed6e7e18..88410a8f75e88 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -12,10 +12,10 @@ namespace openvino_ep { std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + GlobalContext* global_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle) { - std::string type = global_context.device_type; + std::string type = global_context->device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || type.find("NPU") != std::string::npos || type.find("HETERO") != std::string::npos || diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index bfd79bb960dcd..54448067337d7 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -21,11 +21,11 @@ namespace openvino_ep { using namespace backend_utils; BasicBackend::BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + GlobalContext* global_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle) : global_context_(global_context), subgraph_context_(subgraph_context) { - std::string& hw_target = global_context_.device_type; + std::string& hw_target = global_context_->device_type; is_ep_ctx_graph_ = ep_ctx_handle.IsValidOVEPCtxGraph(); @@ -48,41 +48,43 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Set the inference_num_threads property of the CPU SetNumThreads(device_config); + // set workload type to decide on the performance mode + SetWorkLoadType(device_config); + try { - std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str; + std::string dev_prec = global_context_->device_type + "_" + global_context_->precision_str; - if (global_context.is_wholly_supported_graph) { // Full graph is supported + if (global_context_->is_wholly_supported_graph) { // Full graph is supported #if 
defined(IO_BUFFER_ENABLED) if (is_ep_ctx_graph_) { std::istringstream model_stream(ep_ctx_handle.GetModelBlobString()); - exe_network_ = global_context_.ie_core.ImportModel(model_stream, - remote_context_, - subgraph_context_.subgraph_name); - } else if ((global_context.device_type.find("GPU") != std::string::npos) && - (global_context_.context != nullptr)) { + exe_network_ = global_context_->ie_core.ImportModel(model_stream, + remote_context_, + subgraph_context_.subgraph_name); + } else if ((global_context_->device_type.find("GPU") != std::string::npos) && + (global_context_->context != nullptr)) { LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; - cl_context ctx = static_cast(global_context_.context); - remote_context_ = new ov::intel_gpu::ocl::ClContext(global_context_.ie_core.Get(), ctx); + cl_context ctx = static_cast(global_context_->context); + remote_context_ = new ov::intel_gpu::ocl::ClContext(global_context_->ie_core.Get(), ctx); ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + exe_network_ = global_context_->ie_core.CompileModel( ie_cnn_network_, remote_context_, subgraph_context_.subgraph_name); } else { ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + exe_network_ = global_context_->ie_core.CompileModel( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED - std::string prec_str = (global_context_.precision_str != "ACCURACY") ? global_context_.precision_str : global_context_.model_precision; + std::string prec_str = (global_context_->precision_str != "ACCURACY") ? 
global_context_->precision_str : global_context_->model_precision; if (is_ep_ctx_graph_) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob - exe_network_ = global_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), - hw_target, - device_config, - global_context_.ep_context_embed_mode, - subgraph_context_.subgraph_name); - ie_cnn_network_ = exe_network_.Get().get_runtime_model(); - } else if (global_context_.export_ep_ctx_blob && + exe_network_ = global_context_->ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), + hw_target, + device_config, + global_context_->ep_context_embed_mode, + subgraph_context_.subgraph_name); + } else if (global_context_->export_ep_ctx_blob && hw_target.find("NPU") != std::string::npos) { std::shared_ptr ov_model; { @@ -90,28 +92,28 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = global_context_.ie_core.Get().read_model(model, ov::Tensor()); + ov_model = global_context_->ie_core.Get().read_model(model, ov::Tensor()); } - exe_network_ = OVExeNetwork(global_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config)); + exe_network_ = OVExeNetwork(global_context_->ie_core.Get().compile_model(ov_model, hw_target, device_config)); } else if ((!subgraph_context_.has_dynamic_input_shape) && ((hw_target.find("AUTO") == std::string::npos) || - (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) { + (global_context_->OpenVINO_Version.at(0) >= 2024 && global_context_->OpenVINO_Version.at(1) > 2))) { // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above // Inputs with static dimenstions const std::string model = model_proto->SerializeAsString(); - exe_network_ = global_context_.ie_core.CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); + exe_network_ = global_context_->ie_core.CompileModel(model, + hw_target, + device_config, + subgraph_context_.subgraph_name); } else { // For all other types use ov::Model Type ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + exe_network_ = global_context_->ie_core.CompileModel( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #endif } else { // Full graph is not supported ie_cnn_network_ = CreateOVModel(*model_proto, global_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + exe_network_ = global_context_->ie_core.CompileModel( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; @@ -135,21 +137,21 @@ bool BasicBackend::ValidateSubgraph(std::mapprecision_str.find("FP16") != std::string::npos && + global_context_->device_type == "GPU") { device_config.emplace(ov::hint::inference_precision("f16")); } - if (global_context_.precision_str.find("FP32") != std::string::npos) { + if (global_context_->precision_str.find("FP32") != std::string::npos) { device_config.emplace(ov::hint::inference_precision("f32")); } - if (global_context_.precision_str.find("ACCURACY") != std::string::npos && - global_context_.device_type == "GPU") { - if (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) >= 1) { + if 
(global_context_->precision_str.find("ACCURACY") != std::string::npos && + global_context_->device_type == "GPU") { + if (global_context_->OpenVINO_Version.at(0) >= 2024 && global_context_->OpenVINO_Version.at(1) >= 1) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)); } else { - if (global_context_.model_precision != "") - device_config.emplace(ov::hint::inference_precision(global_context_.model_precision)); + if (global_context_->model_precision != "") + device_config.emplace(ov::hint::inference_precision(global_context_->model_precision)); } } #ifndef NDEBUG @@ -160,10 +162,10 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { // Set a priority level for the current workload for preemption; default priority is "DEFAULT" // CPU Plugin doesn't support workload priority - if (global_context_.device_type.find("CPU") == std::string::npos) - device_config.emplace(ov::hint::model_priority(global_context_.model_priority)); + if (global_context_->device_type.find("CPU") == std::string::npos) + device_config.emplace(ov::hint::model_priority(global_context_->model_priority)); - if (global_context_.device_type.find("NPU") != std::string::npos) { + if (global_context_->device_type.find("NPU") != std::string::npos) { std::pair device_property; device_property = std::make_pair("NPU_COMPILER_TYPE", "DRIVER"); @@ -173,15 +175,15 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } device_config.emplace(ov::device::properties("NPU", device_property)); #if (OPENVINO_VERSION_MAJOR >= 2024) && (OPENVINO_VERSION_MINOR > 3) - if (global_context_.export_ep_ctx_blob) { - global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); + if (global_context_->export_ep_ctx_blob) { + global_context_->ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } #endif } - if (!global_context_.load_config.empty()) { + if (!global_context_->load_config.empty()) { std::map target_config; - LoadConfig(global_context_.load_config, target_config); + LoadConfig(global_context_->load_config, target_config); // Parse device types like "AUTO:CPU,GPU" and extract individual devices auto parse_individual_devices = [&](const std::string& device_type) -> std::vector { @@ -213,7 +215,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { const std::vector& supported_properties) { for (const auto& [key, value] : config_options) { if (is_supported_and_mutable(key, supported_properties)) { - global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); + global_context_->ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); } else { LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key << "\" is either unsupported in current OpenVINO version" @@ -224,26 +226,26 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { }; // Check if the device type is AUTO, HETERO, or MULTI - if (global_context_.device_type.find("AUTO") == 0 || - global_context_.device_type.find("HETERO") == 0 || - global_context_.device_type.find("MULTI") == 0) { + if (global_context_->device_type.find("AUTO") == 0 || + global_context_->device_type.find("HETERO") == 0 || + global_context_->device_type.find("MULTI") == 0) { // Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"]) - auto individual_devices = parse_individual_devices(global_context_.device_type); + auto individual_devices = 
parse_individual_devices(global_context_->device_type); // Set properties only for individual devices (e.g., "CPU", "GPU") for (const std::string& device : individual_devices) { if (target_config.count(device)) { // Get supported properties for each individual device - auto device_properties = global_context_.ie_core.Get().get_property(device, ov::supported_properties); + auto device_properties = global_context_->ie_core.Get().get_property(device, ov::supported_properties); // Set properties for the device set_target_properties(device, target_config.at(device), device_properties); } } } else { - if (target_config.count(global_context_.device_type)) { - auto supported_properties = global_context_.ie_core.Get().get_property(global_context_.device_type, + if (target_config.count(global_context_->device_type)) { + auto supported_properties = global_context_->ie_core.Get().get_property(global_context_->device_type, ov::supported_properties); - set_target_properties(global_context_.device_type, - target_config.at(global_context_.device_type), supported_properties); + set_target_properties(global_context_->device_type, + target_config.at(global_context_->device_type), supported_properties); } } } @@ -253,21 +255,21 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (is_ep_ctx_graph_) return; - if (!global_context_.cache_dir.empty() && !global_context_.export_ep_ctx_blob) { + if (!global_context_->cache_dir.empty() && !global_context_->export_ep_ctx_blob) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; - if (global_context_.device_type.find("AUTO:GPU") != std::string::npos) { + if (global_context_->device_type.find("AUTO:GPU") != std::string::npos) { std::pair device_property; - device_property = std::make_pair("CACHE_DIR", global_context_.cache_dir); + device_property = std::make_pair("CACHE_DIR", global_context_->cache_dir); device_config.emplace(ov::device::properties("GPU", device_property)); } else { - global_context_.ie_core.SetCache(global_context_.cache_dir); + global_context_->ie_core.SetCache(global_context_->cache_dir); } } } void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) { - if (global_context_.enable_opencl_throttling == true && - global_context_.device_type.find("GPU") != std::string::npos) { + if (global_context_->enable_opencl_throttling == true && + global_context_->device_type.find("GPU") != std::string::npos) { LOGS_DEFAULT(INFO) << log_tag << "Enabled OpenCL queue throttling for GPU device"; std::pair device_property; device_property = std::make_pair("PLUGIN_THROTTLE", "1"); @@ -278,28 +280,38 @@ void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) { void BasicBackend::EnableStreams() { // Return silently for NPU as it's currently treated as a read-only flag by the NPU plugin // and throws an exception for the same - if (global_context_.device_type.find("NPU") != std::string::npos) + if (global_context_->device_type.find("NPU") != std::string::npos) return; // Streams can be set only if the device is not one of AUTO, MULTI, or HETERO // Throw an exception if the user tries to set num_streams for these devices - if ((global_context_.device_type.find("MULTI") != std::string::npos) || - (global_context_.device_type.find("HETERO") != std::string::npos) || - (global_context_.device_type.find("AUTO") != std::string::npos)) { - if (global_context_.num_streams != 1) { + if ((global_context_->device_type.find("MULTI") != std::string::npos) || + 
(global_context_->device_type.find("HETERO") != std::string::npos) || + (global_context_->device_type.find("AUTO") != std::string::npos)) { + if (global_context_->num_streams != 1) { ORT_THROW(log_tag + "Cannot set NUM_STREAMS to " + - std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type); + std::to_string(global_context_->num_streams) + " for device " + global_context_->device_type); } // Do nothing } else { - global_context_.ie_core.SetStreams(global_context_.device_type, global_context_.num_streams); + global_context_->ie_core.SetStreams(global_context_->device_type, global_context_->num_streams); } } void BasicBackend::SetNumThreads(ov::AnyMap& device_config) { // inference_num_threads is applicable only for the CPU device - if (global_context_.device_type.find("CPU") != std::string::npos) - device_config.emplace(ov::inference_num_threads(global_context_.num_of_threads)); + if (global_context_->device_type.find("CPU") != std::string::npos) + device_config.emplace(ov::inference_num_threads(global_context_->num_of_threads)); +} + +void BasicBackend::SetWorkLoadType(ov::AnyMap& device_config) { + if ((global_context_->OpenVINO_Version.at(0) >= 2024 && + global_context_->OpenVINO_Version.at(1) >= 3)) { + std::pair device_property; + device_property = std::make_pair("WORKLOAD_TYPE", global_context_->workload_type); + device_config.emplace(ov::device::properties("NPU", device_property)); + LOGS_DEFAULT(INFO) << log_tag << "Set compile time workloadtype as " << global_context_->workload_type; + } } // Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on @@ -330,9 +342,9 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && - !global_context_.disable_dynamic_shapes && - (global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + !global_context_->disable_dynamic_shapes && + (global_context_->device_type.find("CPU") != std::string::npos || + global_context_->device_type.find("GPU") != std::string::npos)) { auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name)); auto tensor_info = tensor.GetTensorTypeAndShapeInfo(); auto tensor_shape = tensor_info.GetShape(); @@ -347,7 +359,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque auto input = graph_input_info.at(input_idx); OVTensorPtr tensor_ptr; // avoid input copies on the CPU device - if (global_context_.device_type.find("CPU") != std::string::npos) { + if (global_context_->device_type.find("CPU") != std::string::npos) { tensor_ptr = std::make_shared(input.get_element_type(), input_tensor_shape, (void*)tensor_data); } else { @@ -361,8 +373,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque ORT_THROW(msg); } } else { - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + if ((global_context_->device_type.find("CPU") != std::string::npos || + global_context_->device_type.find("GPU") != std::string::npos)) { OVTensorPtr graph_input_blob; try { graph_input_blob = infer_request->GetTensor(input_name); @@ -394,7 +406,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } input_idx++; } - if (global_context_.device_type.find("NPU") != std::string::npos) { + if 
(global_context_->device_type.find("NPU") != std::string::npos) { // Set the output blob as remote blob auto graph_output_info = exe_network_.Get().outputs(); auto output_idx = 0; @@ -440,6 +452,11 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } // Start Async inference + std::string runtime_workload_type = global_context_->runtime_workload_type; + if (runtime_workload_type == "DEFAULT" || runtime_workload_type == "EFFICIENT") { + LOGS_DEFAULT(VERBOSE) << "[OpenVINO-EP]" << global_context_->runtime_workload_type << " mode is set for OV inference"; + exe_network_.Get().set_property(ov::workload_type(runtime_workload_type)); + } infer_request->StartAsync(); } catch (const char* msg) { ORT_THROW(msg); @@ -548,6 +565,11 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe } // Start Async inference + std::string runtime_workload_type = global_context_->runtime_workload_type; + if (runtime_workload_type == "DEFAULT" || runtime_workload_type == "EFFICIENT") { + LOGS_DEFAULT(VERBOSE) << "[OpenVINO-EP]" << global_context_->runtime_workload_type << " mode is set for OV inference"; + exe_network_.Get().set_property(ov::workload_type(runtime_workload_type)); + } infer_request->StartAsync(); } catch (const char* msg) { ORT_THROW(msg); @@ -588,8 +610,8 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe " doesn't exist in the " "list of OpenVINO output tensor names"); } - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + if ((global_context_->device_type.find("CPU") != std::string::npos || + global_context_->device_type.find("GPU") != std::string::npos)) { try { graph_output_blob = infer_request->GetTensor(output_name); } catch (const char* msg) { @@ -664,8 +686,8 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { infer_request = inferRequestsQueue_->getIdleRequest(); #ifdef IO_BUFFER_ENABLED - if ((global_context_.device_type.find("GPU") != std::string::npos) && - (global_context_.context != nullptr) && global_context_.is_wholly_supported_graph) { + if ((global_context_->device_type.find("GPU") != std::string::npos) && + (global_context_->context != nullptr) && global_context_->is_wholly_supported_graph) { try { StartRemoteAsyncInference(context, infer_request); } catch (std::string const& msg) { @@ -709,7 +731,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { #ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED if (openvino_ep::backend_utils::IsDebugEnabled()) { inferRequestsQueue_->printstatus(); // Printing the elements of infer_requests_ vector pool only in debug mode - std::string& hw_target = global_context_.device_type; + std::string& hw_target = global_context_->device_type; printPerformanceCounts(std::move(infer_request_), std::cout, hw_target); } #endif diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 12502a1d83c5d..5143c10227377 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -30,7 +30,7 @@ class InferRequestsQueue; class BasicBackend : public IBackend { public: BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + GlobalContext* global_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle); @@ -47,6 +47,7 @@ class 
BasicBackend : public IBackend { void EnableGPUThrottling(ov::AnyMap& device_config); void EnableStreams(); void SetNumThreads(ov::AnyMap& device_config); + void SetWorkLoadType(ov::AnyMap& device_config); void StartAsyncInference(Ort::KernelContext& context, std::shared_ptr infer_request); #ifdef IO_BUFFER_ENABLED @@ -55,7 +56,7 @@ class BasicBackend : public IBackend { void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr infer_request); - GlobalContext& global_context_; + GlobalContext* global_context_; SubGraphContext subgraph_context_; mutable std::mutex compute_lock_; std::shared_ptr ie_cnn_network_; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 2d238917eb8ed..36e449c4b7e44 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -33,6 +33,8 @@ struct GlobalContext { std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::string onnx_model_name; std::string onnx_model_path_name; + std::string workload_type; + std::string runtime_workload_type = ""; int onnx_opset_version; void* context = 0; bool use_api_2; diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 7a2d6f4e8cd69..d6a836073f3eb 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -21,7 +21,7 @@ class BackendFactory { public: static std::shared_ptr MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + GlobalContext* global_context, const SubGraphContext& subgraph_context, EPCtxHandler& ctx_handle); }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index c55e7a607e496..1a01d6788ff21 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -13,6 +13,7 @@ #ifdef USE_OVEP_NPU_MEMORY #include "core/providers/openvino/ov_allocator.h" #endif +#include "core/session/onnxruntime_run_options_config_keys.h" #define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) @@ -39,6 +40,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv global_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; global_context_->disable_cpu_fallback = info.disable_cpu_fallback_; global_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; + global_context_->workload_type = info.so_workload_type_; // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of devices plugged in @@ -143,14 +145,12 @@ common::Status OpenVINOExecutionProvider::Compile( // During backend creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob - std::shared_ptr backend_manager = - std::make_shared(*global_context_, + std::make_shared(global_context_.get(), fused_node, graph_body_viewer, *GetLogger(), ep_ctx_handle_); - compute_info.create_state_func = [backend_manager](ComputeContext* context, FunctionState* state) { OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(); @@ -180,10 +180,27 @@ common::Status OpenVINOExecutionProvider::Compile( 
}; node_compute_funcs.push_back(compute_info); } - + return Status::OK(); +} +common::Status OpenVINOExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { + auto workload_type_opt = run_options.GetConfigOptions().GetConfigEntry(kOrtRunOptionsWorkloadType); + if (workload_type_opt.has_value()) { + std::string workload_type = workload_type_opt.value(); + LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" << "Workload type from ORT RunOption = " << workload_type; + std::transform(workload_type.begin(), workload_type.end(), workload_type.begin(), ::tolower); + if (workload_type == "default") { + global_context_->runtime_workload_type = "DEFAULT"; + } else if (workload_type == "efficient") { + global_context_->runtime_workload_type = "EFFICIENT"; + } + } return Status::OK(); } +common::Status OpenVINOExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& run_options) { + global_context_->runtime_workload_type = global_context_->workload_type; + return Status::OK(); +} #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { AllocatorCreationInfo npu_allocator_info{ diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index cc09a4c6878b0..8171712687fb2 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -92,6 +92,7 @@ struct OpenVINOExecutionProviderInfo { bool enable_qdq_optimizer_{false}; bool disable_cpu_fallback_{false}; bool so_epctx_embed_mode_{true}; + std::string so_workload_type_{""}; OpenVINOExecutionProviderInfo() = delete; @@ -102,7 +103,7 @@ struct OpenVINOExecutionProviderInfo { void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode) + bool so_epctx_embed_mode, std::string so_workload_type) : precision_(std::move(precision)), enable_npu_fast_compile_(enable_npu_fast_compile), num_of_threads_(num_of_threads), @@ -116,7 +117,8 @@ struct OpenVINOExecutionProviderInfo { export_ep_ctx_blob_(export_ep_ctx_blob), enable_qdq_optimizer_(enable_qdq_optimizer), disable_cpu_fallback_(disable_cpu_fallback), - so_epctx_embed_mode_{so_epctx_embed_mode} { + so_epctx_embed_mode_{so_epctx_embed_mode}, + so_workload_type_(so_workload_type) { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; @@ -188,6 +190,9 @@ class OpenVINOExecutionProvider : public IExecutionProvider { Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; const void* GetExecutionHandle() const noexcept override { return nullptr; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index c69d53638ae90..5eba00ba6c419 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -53,7 +53,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { bool so_export_ep_ctx_blob = config_options_.GetConfigOrDefault("ep.context_enable", "0") == "1"; bool so_epctx_embed_mode = config_options_.GetConfigOrDefault("ep.context_embed_mode", "1") == 
"1"; std::string so_cache_path = config_options_.GetConfigOrDefault("ep.context_file_path", "").c_str(); - + std::string so_workload_type_ = config_options_.GetConfigOrDefault("session.workload_type", "").c_str(); if (so_export_ep_ctx_blob && !so_cache_path.empty()) { cache_dir_ = so_cache_path; auto file_path = std::filesystem::path(cache_dir_); @@ -70,11 +70,21 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { ORT_THROW("[ERROR] [OpenVINO] Invalid ep_ctx_file_path" + cache_dir_ + " \n"); } } + if (!so_workload_type_.empty()) { + std::transform(so_workload_type_.begin(), so_workload_type_.end(), so_workload_type_.begin(), ::tolower); + if (so_workload_type_ == "default") { + so_workload_type_ = "DEFAULT"; + } else if (so_workload_type_ == "efficient") { + so_workload_type_ = "EFFICIENT"; + } else { + ORT_THROW("[ERROR] [OpenVINO] Invalid workload_type - Supported modes are Default and Efficient \n"); + } + } OpenVINOExecutionProviderInfo info(device_type_, precision_, enable_npu_fast_compile_, num_of_threads_, load_config_, cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, disable_dynamic_shapes_, so_export_ep_ctx_blob, enable_qdq_optimizer_, - so_disable_cpu_fallback, so_epctx_embed_mode); + so_disable_cpu_fallback, so_epctx_embed_mode, so_workload_type_); return std::make_unique(info); }