diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 8887b183c4396..e150a7cd00ec6 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -45,7 +45,7 @@ BackendManager::BackendManager(SessionContext& session_context, subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); // If the graph contains a OVIR wrapped node, we check if it has matching xml file name attribute subgraph_context_.is_ep_ctx_ovir_encapsulated = ep_ctx_handle_.CheckEPCacheContextAttribute(subgraph, - session_context_.onnx_model_path_name.filename().replace_extension("xml").string()); + session_context_.onnx_model_path_name.filename().replace_extension("xml").string()); subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) { // return empty if graph has no inputs or if types are not one of FP32/FP16 @@ -91,21 +91,20 @@ BackendManager::BackendManager(SessionContext& session_context, std::string device_type = session_context_.device_type; auto& sw = shared_context_.shared_weights; - if (session_context_.so_share_ep_contexts) { + if (session_context_.so_share_ep_contexts && !sw.metadata.empty()) { std::filesystem::path weight_filename = session_context_.onnx_model_path_name.parent_path(); - if (sw.external_weight_filename.empty() && !sw.metadata.empty()) { + if (sw.external_weight_filename.empty()) { // Reasonable assumption that all metadata entries have the same external file location sw.external_weight_filename = sw.metadata.begin()->second.location; } weight_filename /= sw.external_weight_filename; std::ifstream weight_file(weight_filename); - if (weight_file) { - if (!sw.mapped_weights) { - sw.mapped_weights = std::make_unique(weight_filename); - } - backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights); + ORT_ENFORCE(weight_file, "Initializer file not found: ", weight_filename.string()); + if (!sw.mapped_weights) { + sw.mapped_weights = std::make_unique(weight_filename); } + backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights); } if (ModelHasSymbolicInputDims(subgraph)) { @@ -196,7 +195,7 @@ BackendManager::BackendManager(SessionContext& session_context, } } if (session_context_.so_context_enable && - (subgraph_context_.is_ep_ctx_ovir_encapsulated || !subgraph_context_.is_ep_ctx_graph)) { + (subgraph_context_.is_ep_ctx_ovir_encapsulated || !subgraph_context_.is_ep_ctx_graph)) { auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph); if (!status.IsOK()) { ORT_THROW(status); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 00a18bb0a45b6..ee74a1b1ee4b3 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -78,24 +78,24 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // specify absolute path for so_context_file_path. 
auto model_file_path = [this]() { if (!session_context_.onnx_model_path_name.empty() && - std::filesystem::exists(session_context_.onnx_model_path_name)) return session_context_.onnx_model_path_name; + std::filesystem::exists(session_context_.onnx_model_path_name)) return session_context_.onnx_model_path_name; ORT_ENFORCE(!session_context_.so_context_file_path.empty() && - std::filesystem::path(session_context_.so_context_file_path).is_absolute() && - std::filesystem::exists(session_context_.so_context_file_path), log_tag + - "Context file path must be non-empty & absolute, when using CreateSessionFormArray() API explicitly." - " Please set a valid absolute path for ep.context_file_path in session options."); + std::filesystem::path(session_context_.so_context_file_path).is_absolute() && + std::filesystem::exists(session_context_.so_context_file_path), + log_tag + + "Context file path must be non-empty & absolute, when using CreateSessionFormArray() API explicitly." + " Please set a valid absolute path for ep.context_file_path in session options."); // Return absolute context file path as input to ImportEPCtxOVIREncapsulation() function. return session_context_.so_context_file_path; - }; // If the EPContext node with OVIR Encapsulation, then create // an executable network from EP_CACHE_CONTEXT using read_model() & compile_model() exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream, - hw_target, - device_config, - enable_causallm, - model_file_path()); + hw_target, + device_config, + enable_causallm, + model_file_path()); } else { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index e2369cf728ea6..6a2b375d733f9 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -64,6 +64,7 @@ class SharedContext : public WeakSingleton { fs::path external_weight_filename; std::unique_ptr mapped_weights; Metadata::Map metadata; + fs::path metadata_filepath; } shared_weights; }; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 49a4cb0a7e95a..9e70756a254aa 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -131,7 +131,7 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesy // exported with must match the version that is currently running. 
ORT_ENFORCE((attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_), "EPCtx blob was exported / is compatible with OpenVINO SDK version " + attrs.at(EP_SDK_VER).s() + - ", but OpenVINO SDK version currently in use is " + openvino_sdk_version_); + ", but OpenVINO SDK version currently in use is " + openvino_sdk_version_); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 5c8293a213f40..7f6a7909f1dec 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -102,15 +102,24 @@ common::Status OpenVINOExecutionProvider::Compile( graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); } - // Temporary code to read metadata before it moves to the .bin - auto& metadata = shared_context_->shared_weights.metadata; - if (session_context_.so_share_ep_contexts && metadata.empty()) { - // Metadata is always read from model location, this could be a source or epctx model - fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; - std::ifstream file(metadata_filename, std::ios::binary); - if (file) { - file >> metadata; + // The block below is executed during EP context model inference + auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory + if (session_context_.so_share_ep_contexts && + !session_context_.so_context_enable && + metadata.empty()) { + fs::path context_model_file_path = session_context_.so_context_file_path; + if (context_model_file_path.empty()) { + // If ep.context_file_path is not set the input model path is used + context_model_file_path = session_context_.onnx_model_path_name; } + + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = context_model_file_path.stem().string() + "_metadata.bin"; + fs::path metadata_file_path = context_model_file_path.parent_path() / metadata_filename; + std::ifstream file(metadata_file_path, std::ios::binary); + ORT_RETURN_IF_NOT(file, "Metadata file was not found: " + metadata_file_path.string()); + shared_context_->shared_weights.metadata_filepath = metadata_file_path; + file >> metadata; } struct OpenVINOEPFunctionState { @@ -173,22 +182,29 @@ common::Status OpenVINOExecutionProvider::Compile( } } - if (session_context_.so_share_ep_contexts) { - fs::path metadata_filename; - if (session_context_.so_context_file_path.empty()) { - metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; - } else { - metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; + // The block below is executed during EP context model generation + if (session_context_.so_context_enable && + session_context_.so_share_ep_contexts && + !metadata.empty()) { + // For models after the first the metadata name comes from the shared context + fs::path metadata_file_path = shared_context_->shared_weights.metadata_filepath; + if (metadata_file_path.empty()) { + metadata_file_path = session_context_.so_context_file_path; + if (metadata_file_path.empty()) { + metadata_file_path = session_context_.onnx_model_path_name; + } + auto metadata_filename = metadata_file_path.stem().string() + "_metadata.bin"; + metadata_file_path.replace_filename(metadata_filename); + 
shared_context_->shared_weights.metadata_filepath = metadata_file_path; } // Metadata is generated only for shared contexts - // If saving metadata then save it to the provided path or ose the original model path + // If saving metadata then save it to the provided path or use the original model path // Multiple calls to Compile() will update the metadata and for the last call // the resulting file will contain the aggregated content - std::ofstream file(metadata_filename, std::ios::binary); - if (file) { - file << metadata; - } + std::ofstream file{metadata_file_path, std::ios::binary}; + ORT_RETURN_IF_NOT(file, "Metadata file could not be written: ", metadata_file_path); + file << metadata; } return status; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 306fa6113b347..918940b9d9917 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -16,7 +16,7 @@ namespace onnxruntime { namespace openvino_ep { template <typename Func, typename... Args> -inline auto OvExceptionBoundary(Func &&func, std::format_string<Args...>&& fmt, Args&&... args) { +inline auto OvExceptionBoundary(Func&& func, std::format_string<Args...>&& fmt, Args&&... args) { try { return func(); } catch (const ov::Exception& e) { @@ -47,462 +47,462 @@ void printDebugInfo(const ov::CompiledModel& obj) { continue; OPENVINO_SUPPRESS_DEPRECATED_END std::cout << " " << item2.first << ": " << item2.second.as<std::string>() << std::endl; + } + } + else { + std::cout << " " << cfg << ": " << prop.as<std::string>() << std::endl; } - } else { - std::cout << " " << cfg << ": " << prop.as<std::string>() << std::endl; } } } -} #endif
device_config; - - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateless OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); } - bool model_status = IsStateful(model); - LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); - if (!model_status) { - LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); - } - - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateful OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); + std::shared_ptr OVCore::ReadModel(std::string && model, const std::string& model_path) { + return OvExceptionBoundary([&]() { + std::istringstream modelStringStream(std::move(model)); + std::istream& modelStream = modelStringStream; + // Try to load with FrontEndManager + ov::frontend::FrontEndManager manager; + ov::frontend::FrontEnd::Ptr FE; + ov::frontend::InputModel::Ptr inputModel; + + ov::AnyVector params{&modelStream, model_path}; + + FE = manager.load_by_model(params); + if (FE) { + inputModel = FE->load(params); + return FE->convert(inputModel); + } else { + ORT_THROW(log_tag + "Unknown exception while Reading network"); + } + }, + "Exception while Reading network"); } - auto kv_pos = GetKVAxesPos(model); + OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr & model, + std::string & hw_target, + const ov::AnyMap& device_config) { + ov::CompiledModel compiled_model; + ov::AnyMap config = device_config; - if (hw_target.find("NPU") != std::string::npos) { - KVDesc kv_desc; - auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { - return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; - }; - - kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); - kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateless OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } - // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 - if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { - ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + bool model_status = IsStateful(model); + LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); + if (!model_status) { + LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; + PatchStatefulDecoder(model); } if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; - std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; - std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; - std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + std::cout << "Stateful OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); } - UpdateNPUConfig(config, kv_pos, kv_desc); - } else { - // This patches the OV IR model so that it only produces the logits required for sampling. - // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, - // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
- ApplySliceBeforeMatmulTransformation(model); - } + auto kv_pos = GetKVAxesPos(model); - LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; - compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); - OVExeNetwork exe(compiled_model, hw_target, true); - return exe; -} + if (hw_target.find("NPU") != std::string::npos) { + KVDesc kv_desc; + auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { + return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; + }; + + kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); + kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); -OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, - std::string& hw_target, - ov::AnyMap& device_config, - bool enable_causallm, - const std::string& name) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - if (enable_causallm) { - auto mutable_model = ie_cnn_network->clone(); - exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 + if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { + ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + } + + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; + std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; + std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; + std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + } + + UpdateNPUConfig(config, kv_pos, kv_desc); } else { - auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + // This patches the OV IR model so that it only produces the logits required for sampling. + // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, + // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
+ ApplySliceBeforeMatmulTransformation(model); } + LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; + compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); + OVExeNetwork exe(compiled_model, hw_target, true); + return exe; + } + + OVExeNetwork OVCore::CompileModel(std::shared_ptr & ie_cnn_network, + std::string & hw_target, + ov::AnyMap & device_config, + bool enable_causallm, + const std::string& name) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + if (enable_causallm) { + auto mutable_model = ie_cnn_network->clone(); + exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + } else { + auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } + #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; + OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; - obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(obj); #endif - OVExeNetwork exe(obj, hw_target); - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + OVExeNetwork exe(obj, hw_target); + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportModel(std::istream& model_stream, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; - obj = core.import_model(model_stream, hw_target, device_config); - OVExeNetwork exe(obj, hw_target); + OVExeNetwork OVCore::ImportModel(std::istream & model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; + obj = core.import_model(model_stream, hw_target, device_config); + OVExeNetwork exe(obj, hw_target); #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, - std::string& hw_target, - const ov::AnyMap& device_config, - bool enable_causallm, - std::filesystem::path model_file_path) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - - bool isXML = backend_utils::IsModelStreamXML(model_stream); - - // Helper function to check if file exists and is readable - const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { - try { - if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { - ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); - } - std::ifstream file(path); - if (!file) { - ORT_THROW(log_tag + "Required file not readable: " + 
path.string()); + OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream & model_stream, + std::string & hw_target, + const ov::AnyMap& device_config, + bool enable_causallm, + std::filesystem::path model_file_path) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + + bool isXML = backend_utils::IsModelStreamXML(model_stream); + + // Helper function to check if file exists and is readable + const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { + try { + if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { + ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); + } + std::ifstream file(path); + if (!file) { + ORT_THROW(log_tag + "Required file not readable: " + path.string()); + } + } catch (const std::exception& e) { + ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); } - } catch (const std::exception& e) { - ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); - } - }; + }; - if (isXML) { - // If the model is XML, we need to load it with the XML content in read_model() - // where weights from bin file is directly consumed - auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); + if (isXML) { + // If the model is XML, we need to load it with the XML content in read_model() + // where weights from bin file is directly consumed + auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); - check_file_access(xml_file_path); + check_file_access(xml_file_path); - LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); + LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); - // Load the model explicitly with XML contents - std::shared_ptr model = core.read_model(xml_file_path.string()); + // Load the model explicitly with XML contents + std::shared_ptr model = core.read_model(xml_file_path.string()); - if (enable_causallm) { - exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); - } else { - auto obj = core.compile_model(model, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + if (enable_causallm) { + exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); + } else { + auto obj = core.compile_model(model, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } } - } #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); -} - - -void OVCore::SetCache(const std::string& cache_dir_path) { - core.set_property(ov::cache_dir(cache_dir_path)); -} - -std::vector OVCore::GetAvailableDevices() const { - std::vector available_devices = core.get_available_devices(); - return available_devices; -} - -std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { - std::vector available_devices; - std::vector devicesIDs; - // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) - try { - devicesIDs = core.get_property(device_type, ov::available_devices); - } catch (const ov::Exception&) { - // plugin is not created by e.g. 
invalid env - // Empty device list will be returned - } catch (const std::exception& ex) { - ORT_THROW(log_tag + "An exception occurred while trying to create the ", - device_type, - " device: ", - ex.what()); - } catch (...) { - ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", - device_type, - " device"); + return exe; + }, + "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); } - if (devicesIDs.size() > 1 || - (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { - for (const auto& deviceID : devicesIDs) { - available_devices.push_back(device_type + '.' + deviceID); - } - } - if (!devicesIDs.empty()) { - available_devices.push_back(device_type); + void OVCore::SetCache(const std::string& cache_dir_path) { + core.set_property(ov::cache_dir(cache_dir_path)); } - return available_devices; -} - -void OVCore::SetStreams(const std::string& device_type, int num_streams) { - core.set_property(device_type, {ov::num_streams(num_streams)}); -} + std::vector OVCore::GetAvailableDevices() const { + std::vector available_devices = core.get_available_devices(); + return available_devices; + } -std::shared_ptr OVExeNetwork::CreateInferRequest() { - return OvExceptionBoundary([&]() { - auto infReq = compiled_model_obj.create_infer_request(); - std::shared_ptr ovInfReq; - if (is_stateful_causallm) { - ovInfReq = std::make_shared(std::move(infReq), target_device); - } else { - ovInfReq = std::make_shared(std::move(infReq)); + std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { + std::vector available_devices; + std::vector devicesIDs; + // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) + try { + devicesIDs = core.get_property(device_type, ov::available_devices); + } catch (const ov::Exception&) { + // plugin is not created by e.g. invalid env + // Empty device list will be returned + } catch (const std::exception& ex) { + ORT_THROW(log_tag + "An exception occurred while trying to create the ", + device_type, + " device: ", + ex.what()); + } catch (...) { + ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", + device_type, + " device"); } - return ovInfReq; - }, - - "Exception while creating InferRequest object"); -} -OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { - return OvExceptionBoundary([&]() { - auto tobj = ovInfReq.get_tensor(input_name); - OVTensorPtr blob = std::make_shared(tobj); - return blob; - }, - " Cannot access IE Blob for input: {}", input_name); -} + if (devicesIDs.size() > 1 || + (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { + for (const auto& deviceID : devicesIDs) { + available_devices.push_back(device_type + '.' 
+ deviceID); + } + } + if (!devicesIDs.empty()) { + available_devices.push_back(device_type); + } -std::string OVInferRequest::GetInputTensorName(uint32_t index) { - return OvExceptionBoundary([&]() { - const auto& model = ovInfReq.get_compiled_model(); - return *model.input(index).get_names().begin(); - }, - " Cannot access IE Blob for input number: {}", index); -} + return available_devices; + } -void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { - OvExceptionBoundary([&]() { - ovInfReq.set_tensor(name, *(blob.get())); - }, - " Cannot set Remote Blob for output: {}", name); -} + void OVCore::SetStreams(const std::string& device_type, int num_streams) { + core.set_property(device_type, {ov::num_streams(num_streams)}); + } -uint32_t OVInferRequest::GetNumInputs() { - return static_cast(ovInfReq.get_compiled_model().inputs().size()); -} + std::shared_ptr OVExeNetwork::CreateInferRequest() { + return OvExceptionBoundary([&]() { + auto infReq = compiled_model_obj.create_infer_request(); + std::shared_ptr ovInfReq; + if (is_stateful_causallm) { + ovInfReq = std::make_shared(std::move(infReq), target_device); + } else { + ovInfReq = std::make_shared(std::move(infReq)); + } + return ovInfReq; + }, -void OVInferRequest::Infer() { - OvExceptionBoundary([&]() { - ovInfReq.infer(); - }, - "In Error Couldn't start Inference"); -} + "Exception while creating InferRequest object"); + } -StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) - : OVInferRequest(std::move(infer_request)), target_device(device) { - bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); - if (gpu_or_npu) { - prefill_use_full_chat_history = true; + OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { + return OvExceptionBoundary([&]() { + auto tobj = ovInfReq.get_tensor(input_name); + OVTensorPtr blob = std::make_shared(tobj); + return blob; + }, + " Cannot access IE Blob for input: {}", input_name); } -} -void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, - const std::vector& shape, int32_t fill_value) { - ov::Tensor tensor = ov::Tensor(type, shape); - std::fill_n(tensor.data(), tensor.get_size(), fill_value); - ovInfReq.set_tensor(tensor_name, tensor); -} + std::string OVInferRequest::GetInputTensorName(uint32_t index) { + return OvExceptionBoundary([&]() { + const auto& model = ovInfReq.get_compiled_model(); + return *model.input(index).get_names().begin(); + }, + " Cannot access IE Blob for input number: {}", index); + } -void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto* pData = tensor.data(); - for (size_t i = 0; i < tensor.get_size(); i++) { - cache.emplace_back(pData[i]); + void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { + OvExceptionBoundary([&]() { + ovInfReq.set_tensor(name, *(blob.get())); + }, + " Cannot set Remote Blob for output: {}", name); } -} -void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, - const std::vector& cache_data) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto new_shape = tensor.get_shape(); - new_shape[1] = cache_data.size(); + uint32_t OVInferRequest::GetNumInputs() { + return static_cast(ovInfReq.get_compiled_model().inputs().size()); + } - auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); - auto* pNewData = 
new_tensor.data(); - std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); + void OVInferRequest::Infer() { + OvExceptionBoundary([&]() { + ovInfReq.infer(); + }, + "In Error Couldn't start Inference"); + } - ovInfReq.set_tensor(tensor_name, new_tensor); -} + StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) + : OVInferRequest(std::move(infer_request)), target_device(device) { + bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + if (gpu_or_npu) { + prefill_use_full_chat_history = true; + } + } -std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { - // Check if tensor exists by examining input names in the compiled model - const auto& model = ovInfReq.get_compiled_model(); - bool tensor_exists = false; + void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value) { + ov::Tensor tensor = ov::Tensor(type, shape); + std::fill_n(tensor.data(), tensor.get_size(), fill_value); + ovInfReq.set_tensor(tensor_name, tensor); + } - for (const auto& input : model.inputs()) { - const auto& names = input.get_names(); - if (names.find(tensor_name) != names.end()) { - tensor_exists = true; - break; + void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto* pData = tensor.data(); + for (size_t i = 0; i < tensor.get_size(); i++) { + cache.emplace_back(pData[i]); } } - if (tensor_exists) { - return ovInfReq.get_tensor(tensor_name); - } + void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, + const std::vector& cache_data) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto new_shape = tensor.get_shape(); + new_shape[1] = cache_data.size(); - return std::nullopt; -} + auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); + auto* pNewData = new_tensor.data(); + std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); -void StatefulOVInferRequest::PreProcessInferRequest() { - // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. - // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - FillTensor("beam_idx", ov::element::i32, {1}, 0); + ovInfReq.set_tensor(tensor_name, new_tensor); + } - // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. 
- if (prefill_use_full_chat_history) { - auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); - CacheTensor("input_ids", cached_input_ids); + std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { + // Check if tensor exists by examining input names in the compiled model + const auto& model = ovInfReq.get_compiled_model(); + bool tensor_exists = false; - // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists - auto position_ids_opt = FindTensor("position_ids"); - bool has_position_ids = position_ids_opt.has_value(); + for (const auto& input : model.inputs()) { + const auto& names = input.get_names(); + if (names.find(tensor_name) != names.end()) { + tensor_exists = true; + break; + } + } - if (has_position_ids) { - CacheTensor("position_ids", cached_position_ids); + if (tensor_exists) { + return ovInfReq.get_tensor(tensor_name); } - // If we're about to run the prefill model - if (input_ids_tensor.get_size() > 1) { - // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". - // This indicates that we are running a subsequent prompt (not the initial prefill). - if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + return std::nullopt; + } + + void StatefulOVInferRequest::PreProcessInferRequest() { + // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. + // TODO(ankit): Address this issue and implement the fix at the appropriate layer. + FillTensor("beam_idx", ov::element::i32, {1}, 0); + + // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. + if (prefill_use_full_chat_history) { + auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); + CacheTensor("input_ids", cached_input_ids); - // Set tensors using cached values - SetTensorFromCache("input_ids", cached_input_ids); + // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists + auto position_ids_opt = FindTensor("position_ids"); + bool has_position_ids = position_ids_opt.has_value(); - // Only set position_ids if it exists and we have cached values - if (has_position_ids && !cached_position_ids.empty()) { - SetTensorFromCache("position_ids", cached_position_ids); + if (has_position_ids) { + CacheTensor("position_ids", cached_position_ids); + } + + // If we're about to run the prefill model + if (input_ids_tensor.get_size() > 1) { + // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". + // This indicates that we are running a subsequent prompt (not the initial prefill). + if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. 
+ ovInfReq.reset_state(); + + // Set tensors using cached values + SetTensorFromCache("input_ids", cached_input_ids); + + // Only set position_ids if it exists and we have cached values + if (has_position_ids && !cached_position_ids.empty()) { + SetTensorFromCache("position_ids", cached_position_ids); + } } } } } -} -void StatefulOVInferRequest::Infer() { - PreProcessInferRequest(); - OVInferRequest::Infer(); -} + void StatefulOVInferRequest::Infer() { + PreProcessInferRequest(); + OVInferRequest::Infer(); + } -void StatefulOVInferRequest::RewindKVCache(size_t index) { - LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; + void StatefulOVInferRequest::RewindKVCache(size_t index) { + LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; - if (prefill_use_full_chat_history) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + if (prefill_use_full_chat_history) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); - // Resize the cached "input_ids" and "position_ids" to the specified index. - if (cached_input_ids.size() > index) { - cached_input_ids.resize(index); - } + // Resize the cached "input_ids" and "position_ids" to the specified index. + if (cached_input_ids.size() > index) { + cached_input_ids.resize(index); + } - if (cached_position_ids.size() > index) { - cached_position_ids.resize(index); - } - } else { - if (index == 0) { - // In this case, since we're resetting the entire KVCache, simply reset the state. - ovInfReq.reset_state(); + if (cached_position_ids.size() > index) { + cached_position_ids.resize(index); + } } else { - // Retrieve KVCache states and trim them to the specified index. - // The following logic is adapted from: - // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 - auto states = ovInfReq.query_state(); - for (auto& state : states) { - ov::Tensor old_tensor = state.get_state(); - // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] - auto shape = old_tensor.get_shape(); - - if (shape[2] > index) { - // Update the sequence length dimension to the specified index. - shape[2] = index; - - ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{shape}; - - // Create a trimmed tensor with the updated shape. - auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); - - // Copy the trimmed tensor into a new tensor and update the state. - ov::Tensor new_tensor(old_tensor.get_element_type(), shape); - trimmed_tensor.copy_to(new_tensor); - - state.set_state(new_tensor); + if (index == 0) { + // In this case, since we're resetting the entire KVCache, simply reset the state. + ovInfReq.reset_state(); + } else { + // Retrieve KVCache states and trim them to the specified index. + // The following logic is adapted from: + // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 + auto states = ovInfReq.query_state(); + for (auto& state : states) { + ov::Tensor old_tensor = state.get_state(); + // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + + if (shape[2] > index) { + // Update the sequence length dimension to the specified index. 
+ shape[2] = index; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + // Create a trimmed tensor with the updated shape. + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + // Copy the trimmed tensor into a new tensor and update the state. + ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } } } } } -} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 0e019342bc86e..fb1757199698b 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -117,7 +117,7 @@ class OVInferRequest { const void* ort_ptr; }; - protected: + protected: ov::InferRequest ovInfReq; std::unordered_map bindings_cache_; @@ -127,7 +127,7 @@ class OVInferRequest { std::string GetInputTensorName(uint32_t index); // Set tensor described param_info and ort_ptr. Overrides shape in param_info with shape_override. Call infer req tensor if ort_ptr is last set. - void SetTensor(const std::string& name, const ov::element::Type &type, const ov::Shape& shape, void* ort_ptr) { + void SetTensor(const std::string& name, const ov::element::Type& type, const ov::Shape& shape, void* ort_ptr) { auto& cached_binding = bindings_cache_[name]; if (cached_binding.ort_ptr != ort_ptr) { auto tensor_ptr = std::make_shared(type, shape, const_cast(ort_ptr)); diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 45ea822685710..88ddde8610c6e 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -38,7 +38,7 @@ GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } else if (enable_qdq_optimizer && device_type_.find("GPU") != std::string::npos) { - npu_qdq_optimizer_enabled = true; // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later + npu_qdq_optimizer_enabled = true; // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later } #if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 99d6e4b7ab5ef..27d8dd7822c41 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -615,7 +615,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { } // experimentally for GPU and qdq stripping mode allow int16 types if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)) - return true; + return true; } #ifndef NDEBUG if (openvino_ep::backend_utils::IsDebugEnabled()) {