@@ -10389,7 +10389,7 @@ index bd52fc90b..7d4e3555a 100644
 if capability < quant_config.get_min_capability():
 raise ValueError(
 diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
-index 89c9b6747..a5be57ce0 100644
+index 89c9b6747..feba4f69f 100644
 --- a/vllm/engine/arg_utils.py
 +++ b/vllm/engine/arg_utils.py
 @@ -210,6 +210,8 @@ class EngineArgs:
@@ -10420,7 +10420,7 @@ index 89c9b6747..a5be57ce0 100644
 parser.add_argument(
 "--disable-cascade-attn",
 action="store_true",
-@@ -1061,6 +1075,8 @@ class EngineArgs:
+@@ -1061,10 +1075,16 @@ class EngineArgs:
 override_generation_config=self.override_generation_config,
 enable_sleep_mode=self.enable_sleep_mode,
 model_impl=self.model_impl,
@@ -10429,7 +10429,26 @@ index 89c9b6747..a5be57ce0 100644
 )
 
 def create_load_config(self) -> LoadConfig:
-@@ -1504,12 +1520,13 @@ class EngineArgs:
+
++ use_low_bit_loader = False
++
++ if self.low_bit_model_path is not None:
++ use_low_bit_loader = True
+ if(self.qlora_adapter_name_or_path is not None) and \
+ self.quantization != "bitsandbytes":
+ raise ValueError(
+@@ -1079,8 +1099,10 @@ class EngineArgs:
+ model_loader_extra_config=self.model_loader_extra_config,
+ ignore_patterns=self.ignore_patterns,
+ use_tqdm_on_load=self.use_tqdm_on_load,
++ use_low_bit_loader=use_low_bit_loader,
+ )
+
++
+ def create_speculative_config(
+ self,
+ target_model_config: ModelConfig,
+@@ -1504,12 +1526,13 @@ class EngineArgs:
 _raise_or_fallback(feature_name=name, recommend_to_remove=True)
 return False
 
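
# A minimal sketch (not part of the patch) of the logic the arg_utils.py hunks
# above add: a boolean derived from --low-bit-model-path is threaded into
# LoadConfig. LoadConfigSketch stands in for vllm.config.LoadConfig; every
# field except the new one is elided, and the sample path is illustrative.
from dataclasses import dataclass
from typing import Optional

@dataclass
class LoadConfigSketch:
    use_low_bit_loader: bool = False

def create_load_config(low_bit_model_path: Optional[str]) -> LoadConfigSketch:
    # the flag is True exactly when a low-bit model path was supplied
    return LoadConfigSketch(use_low_bit_loader=low_bit_model_path is not None)

assert create_load_config("/models/q4_0").use_low_bit_loader
assert not create_load_config(None).use_low_bit_loader
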
@@ -12669,6 +12688,23 @@ index c190a4585..dda2a96cc 100644
 boi = self.boi.expand(x.shape[0], -1, -1)
 eoi = self.eoi.expand(x.shape[0], -1, -1)
 x = torch.cat((boi, x, eoi), dim=1)
+diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
+index cb0379c10..5e8b22ab0 100644
+--- a/vllm/model_executor/models/idefics2_vision_model.py
++++ b/vllm/model_executor/models/idefics2_vision_model.py
+@@ -144,8 +144,10 @@ class Idefics2VisionAttention(nn.Module):
+ )
+ self.tp_size = get_tensor_model_parallel_world_size()
+ self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
+- self.attn = MultiHeadAttention(self.num_heads_per_partition,
+- self.head_dim, self.scale)
++ # self.attn = MultiHeadAttention(self.num_heads_per_partition,
++ # self.head_dim, self.scale)
++ from vllm.model_executor.models.siglip import SelfAttention
++ self.attn = SelfAttention(self.num_heads_per_partition, self.head_dim, self.scale)
+
+ def forward(
+ self,
 diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
 index 5fab9df3f..f8e6fbe24 100644
 --- a/vllm/model_executor/models/minicpmv.py
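
# A sketch (not part of the patch) of what the idefics2 hunk above swaps in:
# vLLM's MultiHeadAttention is replaced by the SelfAttention class from
# siglip.py, keeping the (num_heads, head_dim, scale) constructor. Tensor
# shapes and the forward signature are assumed; plain SDPA stands in for
# whatever kernel the real class dispatches to on XPU.
import torch
import torch.nn.functional as F

class SelfAttentionSketch(torch.nn.Module):
    def __init__(self, num_heads: int, head_dim: int, scale: float) -> None:
        super().__init__()
        self.num_heads, self.head_dim, self.scale = num_heads, head_dim, scale

    def forward(self, q, k, v):
        # q, k, v assumed (batch, seq, num_heads, head_dim)
        q, k, v = (x.transpose(1, 2) for x in (q, k, v))  # -> (b, h, s, d)
        out = F.scaled_dot_product_attention(q, k, v, scale=self.scale)  # torch>=2.1
        return out.transpose(1, 2)  # back to (batch, seq, num_heads, head_dim)
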
@@ -13552,6 +13588,18 @@ index 000000000..d96085f46
 + hidden_states=encoder_outputs.hidden_states,
 + attentions=encoder_outputs.attentions,
 + )
+diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
+index db90848f9..5eabcf653 100644
+--- a/vllm/model_executor/models/phi4mm_audio.py
++++ b/vllm/model_executor/models/phi4mm_audio.py
+@@ -230,6 +230,7 @@ class ConformerEncoderLayer(nn.Module):
+ x = x + 0.5 * self.feed_forward_in(x)
+ norm_x = self.layer_norm_att(x)
+
++ mask = mask.to(x.device)
+ x = x + self.self_attn(
+ norm_x,
+ norm_x,
 diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
 index c4d02e5dd..2831a5a12 100644
 --- a/vllm/model_executor/models/qwen2.py
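
# The one-line phi4mm_audio.py fix above guards against a CPU-resident mask
# meeting XPU-resident activations, which PyTorch rejects as a cross-device
# op. The same normalization pattern in isolation (names and the mask's
# downstream use are assumed, not taken from the patch):
import torch

def masked_residual(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    mask = mask.to(x.device)  # mirrors the patched line
    return x.masked_fill(mask, 0.0)
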
@@ -13589,41 +13637,85 @@ index c4d02e5dd..2831a5a12 100644
 )
 
 diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
-index 1e6ff1fec..e2480326a 100644
+index 1e6ff1fec..90ebe5ca9 100644
 --- a/vllm/model_executor/models/qwen2_5_vl.py
 +++ b/vllm/model_executor/models/qwen2_5_vl.py
-@@ -304,6 +304,10 @@ class Qwen2_5_VisionAttention(nn.Module):
+@@ -302,23 +302,33 @@ class Qwen2_5_VisionAttention(nn.Module):
+ "(b s) ... -> b s ...",
+ b=batch_size)
 elif self.attn_backend == _Backend.TORCH_SDPA:
- # Execute attention entry by entry for speed & less VRAM.
- outputs = []
-+ head_dim = q.shape[-1]
-+ import math
-+ import xe_addons
-+ scale = 1 / math.sqrt(head_dim)
- for i in range(1, len(cu_seqlens)):
- start_idx = cu_seqlens[i - 1]
- end_idx = cu_seqlens[i]
-@@ -312,10 +316,16 @@ class Qwen2_5_VisionAttention(nn.Module):
- v_i = v[:, start_idx:end_idx]
- q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
- for x in [q_i, k_i, v_i])
+- # Execute attention entry by entry for speed & less VRAM.
+- outputs = []
+- for i in range(1, len(cu_seqlens)):
+- start_idx = cu_seqlens[i - 1]
+- end_idx = cu_seqlens[i]
+- q_i = q[:, start_idx:end_idx]
+- k_i = k[:, start_idx:end_idx]
+- v_i = v[:, start_idx:end_idx]
+- q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
+- for x in [q_i, k_i, v_i])
 - output_i = F.scaled_dot_product_attention(q_i,
 - k_i,
 - v_i,
 - dropout_p=0.0)
-+ # output_i = F.scaled_dot_product_attention(q_i,
-+ # k_i,
-+ # v_i,
-+ # dropout_p=0.0)
-+ output_i = xe_addons.sdp_non_causal(
-+ q_i.contiguous(),
-+ k_i.contiguous(),
-+ v_i.contiguous(),
-+ None,
-+ scale)
- output_i = rearrange(output_i, "b h s d -> b s h d ")
- outputs.append(output_i)
- context_layer = torch.cat(outputs, dim=1)
+- output_i = rearrange(output_i, "b h s d -> b s h d ")
+- outputs.append(output_i)
+- context_layer = torch.cat(outputs, dim=1)
++ # TODO(xiangyu): Maybe add attn_backend xpu?
++ q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
++ from vllm._ipex_ops import ipex_ops
++ output = torch.empty(
++ (q.shape[0], q.shape[1], q.shape[2]),
++ dtype=q.dtype,
++ device=q.device)
++ import math
++ head_dim = q.shape[-1]
++ scale = 1 / math.sqrt(head_dim)
++ ipex_ops.varlen_attention(q, k, v, output,
++ cu_seqlens,
++ cu_seqlens,
++ max_seqlen,
++ max_seqlen,
++ pdropout=0,
++ softmax_scale=scale,
++ zero_tensors=False,
++ is_causal=False,
++ return_softmax=False,
++ gen_=None,
++ logits_soft_cap=0
++ )
++
++ context_layer = rearrange(output,
++ "(b s) ... -> b s ...",
++ b=batch_size)
+ elif self.attn_backend == _Backend.XFORMERS:
+ from xformers import ops as xops
+ from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+@@ -613,10 +623,11 @@ class Qwen2_5_VisionTransformer(nn.Module):
+ cu_seqlens: torch.Tensor,
+ ) -> tuple[Optional[int], Optional[list[int]]]:
+ max_seqlen, seqlens = None, None
+- if self.attn_backend == _Backend.FLASH_ATTN:
+- max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+- elif self.attn_backend == _Backend.XFORMERS:
+- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
++ # if self.attn_backend == _Backend.FLASH_ATTN:
++ # max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
++ # elif self.attn_backend == _Backend.XFORMERS:
++ # seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
++ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+ return max_seqlen, seqlens
+
+ def forward(
+@@ -1082,7 +1093,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
+ image_input=image_input,
+ video_input=video_input)
+ input_ids = None
+-
++
+ hidden_states = self.language_model.model(
+ input_ids=input_ids,
+ positions=positions,
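
# Reference semantics (not part of the patch) for the ipex_ops.varlen_attention
# call the hunk above introduces: q/k/v arrive packed as (total_tokens, heads,
# head_dim) with sequence boundaries in cu_seqlens, and the kernel computes
# non-causal attention per sequence -- the same result the removed per-sequence
# SDPA loop produced (softmax_scale=1/sqrt(head_dim) is also SDPA's default).
import torch
import torch.nn.functional as F

def varlen_attention_reference(q, k, v, cu_seqlens):
    outputs = []
    for i in range(1, len(cu_seqlens)):
        start, end = cu_seqlens[i - 1], cu_seqlens[i]
        # (tokens, heads, dim) -> (1, heads, tokens, dim) for SDPA
        q_i, k_i, v_i = (x[start:end].transpose(0, 1).unsqueeze(0)
                         for x in (q, k, v))
        out = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
        outputs.append(out.squeeze(0).transpose(0, 1))
    return torch.cat(outputs, dim=0)  # repacked as (total_tokens, heads, dim)
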
@@ -15133,10 +15225,10 @@ index c271f438e..cf7180606 100755
 assert sliding_window == (-1, -1), (
 diff --git a/vllm/v1/attention/backends/ipex_attn.py b/vllm/v1/attention/backends/ipex_attn.py
 new file mode 100644
-index 000000000..f4a435eaa
+index 000000000..964696cfe
 --- /dev/null
 +++ b/vllm/v1/attention/backends/ipex_attn.py
-@@ -0,0 +1,392 @@
+@@ -0,0 +1,404 @@
 +from dataclasses import dataclass
 +from typing import Any, Dict, List, Optional, Tuple, Type
 +
@@ -15152,6 +15244,10 @@ index 000000000..f4a435eaa
 +from vllm.attention.backends.ipex_attn import use_gqa_kernel
 +from vllm.utils import is_bmg_platform
 +import os
++from vllm.logger import init_logger
++
++logger = init_logger(__name__)
++
 +
 +@dataclass
 +class IPEXAttentionMetadata(FlashAttentionMetadata):
@@ -15246,6 +15342,12 @@ index 000000000..f4a435eaa
1524615342+ "are not implemented for "
1524715343+ "IpexAttnBackendImpl")
1524815344+
15345+ + flag = os.getenv("IPEX_LLM_PREFILL_VARLEN_BACKEND", None)
15346+ + self.ipex_varlen_attn = False
15347+ + if flag is not None:
15348+ + self.ipex_varlen_attn = True
15349+ + logger.info_once(f"V1 engine using varlen_attention for prefilling.")
15350+ +
1524915351+ def forward(
1525015352+ self,
1525115353+ layer: AttentionLayer,
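
# Behavior of the new IPEX_LLM_PREFILL_VARLEN_BACKEND switch, condensed from
# the hunk above (a sketch, not part of the patch): setting the variable to
# any value, even "0" or "", makes os.getenv return non-None, so the flag
# flips on; it is later forwarded into the attention entry point, where
# `flag or is_bmg_platform` selects the flash-layout KV-cache path.
import os

def prefill_varlen_enabled() -> bool:
    return os.getenv("IPEX_LLM_PREFILL_VARLEN_BACKEND", None) is not None

# illustrative launch: IPEX_LLM_PREFILL_VARLEN_BACKEND=1 python your_server.py
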
@@ -15293,6 +15395,7 @@ index 000000000..f4a435eaa
 + self.sliding_window,
 + self.alibi_slopes,
 + self.logits_soft_cap,
++ self.ipex_varlen_attn,
 + )
 + return output.view(-1, self.num_heads * self.head_size)
 +
@@ -15367,6 +15470,7 @@ index 000000000..f4a435eaa
 + sliding_window: Optional[List[int]] = None,
 + alibi_slopes: Optional[torch.Tensor] = None,
 + logits_soft_cap: Optional[float] = None,
++ flag: Optional[bool] = False,
 +) -> None:
 + context = get_forward_context()
 + current_metadata = context.attn_metadata
@@ -15382,7 +15486,7 @@ index 000000000..f4a435eaa
 + key = key.view(-1, num_kv_heads, head_size)
 + value = value.view(-1, num_kv_heads, head_size)
 +
-+ if is_bmg_platform:
++ if flag or is_bmg_platform:
 + key_cache, value_cache = kv_cache.unbind(0)
 + ipex_ops.reshape_and_cache_flash(
 + key[:num_actual_tokens],
@@ -17087,7 +17191,7 @@ index 000000000..dffc7b367
 + return (attn_metadata, encoder_input_tokens_tensor,
 + encoder_input_positions_tensor)
 diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
-index 9d49b4385..78e0c54f2 100644
+index 9d49b4385..dc5e95f4e 100644
 --- a/vllm/worker/xpu_model_runner.py
 +++ b/vllm/worker/xpu_model_runner.py
 @@ -5,8 +5,8 @@ import time
@@ -17735,15 +17839,17 @@ index 9d49b4385..78e0c54f2 100644
 max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
 self.model_config)
 if max_mm_tokens > 0:
-@@ -461,6 +820,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -461,6 +820,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 "Computed max_num_seqs (%s) to be less than 1. "
 "Setting it to the minimum value of 1.", expr)
 max_num_seqs = 1
 + '''
++ if "phi4mm" in self.model_config.hf_config.model_type:
++ max_num_seqs = 1
 
 batch_size = 0
 for group_id in range(max_num_seqs):
-@@ -479,11 +839,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -479,11 +841,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 seq_data={group_id: dummy_data.seq_data},
 sampling_params=sampling_params,
 block_tables=None,
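
# The xpu_model_runner.py hunk above pins profiling to a single sequence for
# phi4mm. Reduced to a pure function (a sketch, not part of the patch;
# model_type comes from self.model_config.hf_config.model_type in the patch):
def clamp_max_num_seqs(max_num_seqs: int, model_type: str) -> int:
    if "phi4mm" in model_type:
        return 1
    return max(max_num_seqs, 1)  # the surrounding code already floors at 1
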
@@ -17759,7 +17865,7 @@ index 9d49b4385..78e0c54f2 100644
 finished_requests_ids = [seq.request_id for seq in seqs]
 model_input = self.prepare_model_input(
 seqs, finished_requests_ids=finished_requests_ids)
-@@ -493,25 +856,39 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -493,25 +858,39 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 batch_size=batch_size,
 dtype=self.model_config.dtype,
 device=self.device)
@@ -17810,7 +17916,7 @@ index 9d49b4385..78e0c54f2 100644
1781017916 """Helper method to prepare the model input based on a given sequence
1781117917 group. Prepares metadata needed for the base model forward pass but not
1781217918 metadata for possible additional steps, e.g., sampling.
17813- @@ -524,6 +901 ,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
17919+ @@ -524,6 +903 ,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
1781417920
1781517921 return builder.build() # type: ignore
1781617922
@@ -17833,7 +17939,7 @@ index 9d49b4385..78e0c54f2 100644
 def prepare_model_input(
 self,
 seq_group_metadata_list: List[SequenceGroupMetadata],
-@@ -563,6 +956,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -563,6 +958,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 raise ValueError(
 "XPUModelRunner does not support multi-step execution.")
 
@@ -17846,7 +17952,7 @@ index 9d49b4385..78e0c54f2 100644
 model_executable = self.model
 if (self.observability_config is not None
 and self.observability_config.collect_model_forward_time):
-@@ -612,3 +1011,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -612,3 +1013,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 output.model_forward_time = model_forward_time
 
 return [output]