@@ -391,7 +391,7 @@ StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, s
391391 }
392392}
393393
394- void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, const ov::element::Type& type,
394+ void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type,
395395 const std::vector<size_t>& shape, int32_t fill_value) {
396396 ov::Tensor tensor = ov::Tensor(type, shape);
397397 std::fill_n(tensor.data<int32_t>(), tensor.get_size(), fill_value);
@@ -419,16 +419,43 @@ void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name,
419419 ovInfReq.set_tensor (tensor_name, new_tensor);
420420}
421421
422+ std::optional<ov::Tensor> StatefulOVInferRequest::FindTensor(const std::string& tensor_name) {
423+   // Check if tensor exists by examining input names in the compiled model
424+   const auto& model = ovInfReq.get_compiled_model();
425+   bool tensor_exists = false;
426+
427+   for (const auto& input : model.inputs()) {
428+     const auto& names = input.get_names();
429+     if (names.find(tensor_name) != names.end()) {
430+       tensor_exists = true;
431+       break;
432+     }
433+   }
434+
435+   if (tensor_exists) {
436+     return ovInfReq.get_tensor(tensor_name);
437+   }
438+
439+   return std::nullopt;
440+ }
441+
422442void StatefulOVInferRequest::PreProcessInferRequest () {
423443 // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently.
424444 // TODO(ankit): Address this issue and implement the fix at the appropriate layer.
425- CacheTensor (" beam_idx" , ov::element::i32 , {1 }, 0 );
445+ FillTensor (" beam_idx" , ov::element::i32 , {1 }, 0 );
426446
427- // If 'prefill full chat history' mode is enabled, we need to cache input_ids and position_ids.
447+ // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids.
428448 if (prefill_use_full_chat_history) {
429449 auto input_ids_tensor = ovInfReq.get_tensor (" input_ids" );
430450 CacheTensor (" input_ids" , cached_input_ids);
431- CacheTensor (" position_ids" , cached_position_ids);
451+
452+ // "position_ids" (GQA with Rotary Embeddings doesn't have position_ids) - check if exists
453+ auto position_ids_opt = FindTensor (" position_ids" );
454+ bool has_position_ids = position_ids_opt.has_value ();
455+
456+ if (has_position_ids) {
457+ CacheTensor (" position_ids" , cached_position_ids);
458+ }
432459
433460 // If we're about to run the prefill model
434461 if (input_ids_tensor.get_size () > 1 ) {
@@ -440,7 +467,11 @@ void StatefulOVInferRequest::PreProcessInferRequest() {
440467
441468 // Set tensors using cached values
442469 SetTensorFromCache (" input_ids" , cached_input_ids);
443- SetTensorFromCache (" position_ids" , cached_position_ids);
470+
471+ // Only set position_ids if it exists and we have cached values
472+ if (has_position_ids && !cached_position_ids.empty ()) {
473+ SetTensorFromCache (" position_ids" , cached_position_ids);
474+ }
444475 }
445476 }
446477 }
0 commit comments