Skip to content

Commit 5938570

Browse files
OVEP Stateful: Improve accuracy on NPU for sequence lengths >= 2048
1 parent c531a61 commit 5938570

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

onnxruntime/core/providers/openvino/ov_interface.cc

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -131,6 +131,13 @@ OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr<OVNetwork>& model,
131131
}
132132

133133
UpdateNPUConfig(config, kv_pos, kv_desc);
134+
135+
if ((kv_desc.max_prompt_len + kv_desc.min_response_len) >= 2048) {
136+
// This improves accuracy for generation sequences of 2048 tokens or more (max prompt length + min response length).
137+
config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}};
138+
config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}};
139+
}
140+
134141
} else {
135142
// This patches the OV IR model so that it only produces the logits required for sampling.
136143
// Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device,

0 commit comments

Comments
 (0)