@@ -62,25 +62,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
   try {
     // IO_BUFFER is enabled on GPU HW.
     // Pre-requisite is provider_option "context" must be set
-#if defined(IO_BUFFER_ENABLED)
-    cl_context ctx = static_cast<cl_context>(session_context_.context);
-    remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get()->core, ctx);
-    if (subgraph_context_.is_ep_ctx_graph) {
-      exe_network_ = OVCore::Get()->ImportModel(*model_stream,
-                                                remote_context_,
-                                                subgraph_context_.subgraph_name);
-      model_stream.reset();  // Delete stream after it is no longer needed
-    } else {
-      std::string model = model_proto->SerializeAsString();
-      if (!subgraph_context.has_dynamic_input_shape) {
-        model_proto.reset()
-      }
-      auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
-      LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
-      exe_network_ = OVCore::Get()->CompileModel(
-          ov_model, remote_context_, subgraph_context_.subgraph_name);
-    }
-#else  // !IO_BUFFER_ENABLED
     auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
                                  (session_context_.OpenVINO_Version.at(0) >= 2024 &&
                                   session_context_.OpenVINO_Version.at(1) > 2));
@@ -117,7 +98,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
       exe_network_ = OVCore::Get()->CompileModel(
           ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
     }
-#endif
     LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
   } catch (const char* msg) {
     ORT_THROW(msg);
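
Note on the retained path: the auto_unified_compile flag computed above appears to gate the unified CompileModel call that compiles directly from the serialized proto; the branch that consumes it sits outside this hunk. A hedged restatement of the predicate, with interpretive comments only:

    // Sketch: the unified compile path is considered eligible for
    //  - any non-AUTO target, or
    //  - an AUTO target whose OpenVINO version reports major >= 2024 and
    //    minor > 2 (i.e. 2024.3+, assumed to be when AUTO gained support
    //    for the unified compile_model API).
    bool auto_unified_compile =
        (hw_target.find("AUTO") == std::string::npos) ||
        (session_context_.OpenVINO_Version.at(0) >= 2024 &&
         session_context_.OpenVINO_Version.at(1) > 2);
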
@@ -459,150 +439,46 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
   }
 }

-#ifdef IO_BUFFER_ENABLED
-// Wait for Remote Aynchronous inference completion
-void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
-  try {
-    auto graph_input_info = exe_network_.Get().inputs();
-    int input_idx = 0;
-    for (auto input_info_iter = graph_input_info.begin();
-         input_info_iter != graph_input_info.end(); ++input_info_iter) {
-      auto input_names = input_info_iter->get_names();
-      std::string onnx_input_name;
-      std::string input_name;
-      // use names retrieved from original ONNX model to assign the right onnx input name for the graph
-      for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) {
-        if (it->second == input_idx) {
-          onnx_input_name = it->first;
-          break;
-        }
-      }
-      // using the input name retrieved from ONNX original to match with the input names returned by OV tensors
-      if (input_names.find(onnx_input_name) != input_names.end()) {
-        input_name = onnx_input_name;
-      } else {
-        ORT_THROW(log_tag +
-                  "Input names mismatch between OpenVINO and ONNX. " +
-                  onnx_input_name +
-                  " doesn't exist in the list of OpenVINO input tensor names");
-      }
-      input_idx++;
-      // Kernel Context Input Buffer
-      const auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
-      // If the ORTValue wraps a device pointer
-      auto mem_info = tensor.GetTensorMemoryInfo();
-      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
-        // Get the shared buffer pointer
-        const void* tensor_data = tensor.GetTensorRawData();
-        const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
-        // Create an Input Remote Blob
-        auto input = graph_input_info.at(0);
-        auto remote_blob = remote_context_->create_tensor(
-            input.get_element_type(), input.get_shape(), *shared_buffer_const);
-        ov::Tensor tensor_remote = static_cast<ov::Tensor>(remote_blob);
-        OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_remote);
-        infer_request->SetTensor(input_name, tensor_ptr);
-      } else {
-        OVTensorPtr graph_input_blob;
-        graph_input_blob = infer_request->GetTensor(input_name);
-        size_t batch_slice_idx = 0;
-        FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_);
-      }
-    }
-
-    // Set the output blob as remote blob
-    auto graph_output_info = exe_network_.Get().outputs();
-    for (auto output_info_iter = graph_output_info.begin();
-         output_info_iter != graph_output_info.end(); ++output_info_iter) {
-      auto output_names = output_info_iter->get_names();
-      std::string onnx_output_name;
-      std::string output_name;
-      bool output_name_found = false;
-      // using the output name retrieved from ONNX original to match with the output names returned by OV tensors
-      for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
-        onnx_output_name = it->first;
-        if (output_names.find(onnx_output_name) != output_names.end()) {
-          // Assigning the output_name
-          output_name = it->first;
-          output_name_found = true;
-          break;
-        }
-      }
-      if (!output_name_found) {
-        ORT_THROW(
-            log_tag +
-            "Output names mismatch between OpenVINO and ONNX. [ONNX Output: ] " +
-            onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names");
-      }
-
-      size_t batch_size = 1;
-      Ort::UnownedValue tensor = GetOutputTensor(context,
-                                                 batch_size,
-                                                 infer_request,
-                                                 output_name,
-                                                 subgraph_context_.output_names);
-      auto mem_info = tensor.GetTensorMemoryInfo();
-      // Check if ORT Value wraps a device pointer
-      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
-        const void* tensor_data = tensor.GetTensorRawData();
-        const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
-        // Create a shared Blob, set the Infer Request Output Blob
-        auto output = graph_output_info.at(0);
-        auto remote_tensor =
-            remote_context_->create_tensor(output.get_element_type(), output.get_shape(), *shared_buffer_const);
-        ov::Tensor tensor_t = static_cast<ov::Tensor>(remote_tensor);
-        OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_t);
-        try {
-          infer_request->SetTensor(output_name, tensor_ptr);
-        } catch (const char* msg) {
-          ORT_THROW(msg);
-        }
-      }
-    }
-
-    // Start Async inference
-    infer_request->StartAsync();
-  } catch (const char* msg) {
-    ORT_THROW(msg);
-  }
-}
-#endif
-
 // Wait for asynchronous inference completion on an Infer Request object indexed by infer_req_idx
 // and copy the results into a slice location within the batched output buffer indexed by batch_slice_idx
 void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
   // Wait for Async inference completion
   try {
-    bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
-                      session_context_.device_type.find("GPU") != std::string::npos;
-
     infer_request->WaitRequest();
+  } catch (const std::runtime_error& e) {
+    infer_request->CancelRequest();
+    inferRequestsQueue_->deleteRequest();
+    ORT_THROW(log_tag + e.what());
+  }

-    if (cpu_or_gpu) {
-      for (const auto& output_info : bindings_->network_outputs_) {
-        OVTensorPtr graph_output_blob;
-        try {
-          graph_output_blob = infer_request->GetTensor(output_info.name);
-        } catch (const char* msg) {
-          ORT_THROW(msg);
-        }
-        size_t batch_size = 1;
-        Ort::UnownedValue output_tensor =
-            GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
-        auto mem_info = output_tensor.GetTensorMemoryInfo();
-        if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
+  bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
+                    session_context_.device_type.find("GPU") != std::string::npos;
+  if (cpu_or_gpu) {
+    for (const auto& output_info : bindings_->network_outputs_) {
+      OVTensorPtr graph_output_blob;
+      try {
+        graph_output_blob = infer_request->GetTensor(output_info.name);
+      } catch (const char* msg) {
+        ORT_THROW(msg);
+      }
+      size_t batch_size = 1;
+      Ort::UnownedValue output_tensor =
+          GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
+      auto mem_info = output_tensor.GetTensorMemoryInfo();
+      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
         return;
-        } else {
-          size_t batch_slice = 0;
-          FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
-        }
+      } else {
+        size_t batch_slice = 0;
+        FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
       }
     }
+  }

-    if (!const_outputs_map_.empty()) {
-      for (const auto& item : const_outputs_map_) {
-        const auto& out_name = item.first;
-        auto node = item.second;
+  if (!const_outputs_map_.empty()) {
+    for (const auto& item : const_outputs_map_) {
+      const auto& out_name = item.first;
+      auto node = item.second;
+      try {
         Ort::UnownedValue output_tensor = GetOutputTensor(context,
                                                           out_name,
                                                           subgraph_context_.output_names,
@@ -613,10 +489,10 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
         } else {
           FillOutputsWithConstantData(std::move(node), output_tensor);
         }
+      } catch (std::string const& msg) {
+        ORT_THROW(msg);
       }
     }
-  } catch (const char* msg) {
-    ORT_THROW(msg);
   }
 }

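Note: the hunks above restructure CompleteAsyncInference so that the wait is isolated in its own try/catch and the failing request is recovered before the error is rethrown, instead of wrapping the whole output-copy loop in one handler. A condensed sketch of the resulting flow (names taken from the diff; everything else elided):

    // 1. Wait; on failure cancel the in-flight request, retire it from the
    //    request pool, and rethrow so ONNX Runtime surfaces the error.
    try {
      infer_request->WaitRequest();
    } catch (const std::runtime_error& e) {
      infer_request->CancelRequest();
      inferRequestsQueue_->deleteRequest();
      ORT_THROW(log_tag + e.what());
    }
    // 2. Only after a successful wait: copy each OV output tensor into the
    //    ORT kernel context (skipped when the output already lives on the
    //    GPU), then fill any constant-folded outputs from const_outputs_map_.
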
@@ -650,31 +526,20 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     }

   } else {
-    // Requesting for an idle infer_request from a pool of infer_requests_
     OVInferRequestPtr infer_request;
     infer_request = inferRequestsQueue_->getIdleRequest();
-#ifdef IO_BUFFER_ENABLED
-    if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-        (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) {
-      try {
-        StartRemoteAsyncInference(context, infer_request);
-      } catch (std::string const& msg) {
-        ORT_THROW(msg);
-      }
-    } else {
-      try {
-        StartAsyncInference(context, infer_request);
-      } catch (std::string const& msg) {
-        ORT_THROW(msg);
-      }
+    if (infer_request == nullptr) {
+      ORT_THROW("OpenVINO Execution Provider :: There are no inference requests");
+      LOGS_DEFAULT(FATAL) << log_tag << "Create Infer Requests do not exist";
+      return;
     }
-#else
+
+    LOGS_DEFAULT(INFO) << log_tag << "Get Idle Request";
     try {
       StartAsyncInference(context, infer_request);
     } catch (const std::runtime_error& e) {
       ORT_THROW(log_tag + "Exception at StartAsyncInference: " + e.what());
     }
-#endif
     try {
       CompleteAsyncInference(context, infer_request);
     } catch (const std::runtime_error& e) {
@@ -696,13 +561,11 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     // Once the inference is completed, the infer_request becomes free and is placed back into pool of infer_requests_
     inferRequestsQueue_->putIdleRequest(std::move(infer_request));
 #ifndef NDEBUG
-#ifndef IO_BUFFER_ENABLED  // Printing performance counts is disabled when IO_BUFFER_ENABLED
     if (openvino_ep::backend_utils::IsDebugEnabled()) {
       inferRequestsQueue_->printstatus();  // Printing the elements of infer_requests_ vector pool only in debug mode
       std::string& hw_target = session_context_.device_type;
       printPerformanceCounts(std::move(infer_request_), std::cout, hw_target);
     }
-#endif
 #endif
   }
 }
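
Note: with the IO_BUFFER branch removed, the non-dynamic-shape path of Infer() reduces to a single request lifecycle. A condensed sketch (names taken from the diff; the try/catch wrappers and debug-only printing shown above are elided):

    // Borrow an idle request from the pool, run inference, return it.
    OVInferRequestPtr infer_request = inferRequestsQueue_->getIdleRequest();
    if (infer_request == nullptr) {
      ORT_THROW("OpenVINO Execution Provider :: There are no inference requests");
    }
    StartAsyncInference(context, infer_request);      // bind inputs, start the async run
    CompleteAsyncInference(context, infer_request);   // wait for completion, copy outputs
    inferRequestsQueue_->putIdleRequest(std::move(infer_request));  // back to the pool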