
Commit 5a629ae

liu-shaojun and gc-fu authored
update vllm patch (#13211)
Co-authored-by: gc-fu <[email protected]>
1 parent ac04992 commit 5a629ae

File tree

1 file changed (+147, -41)

docker/llm/serving/xpu/docker/vllm_for_multi_arc.patch

Lines changed: 147 additions & 41 deletions
@@ -10389,7 +10389,7 @@ index bd52fc90b..7d4e3555a 100644
 if capability < quant_config.get_min_capability():
 raise ValueError(
 diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
-index 89c9b6747..a5be57ce0 100644
+index 89c9b6747..feba4f69f 100644
 --- a/vllm/engine/arg_utils.py
 +++ b/vllm/engine/arg_utils.py
 @@ -210,6 +210,8 @@ class EngineArgs:
@@ -10420,7 +10420,7 @@ index 89c9b6747..a5be57ce0 100644
 parser.add_argument(
 "--disable-cascade-attn",
 action="store_true",
-@@ -1061,6 +1075,8 @@ class EngineArgs:
+@@ -1061,10 +1075,16 @@ class EngineArgs:
 override_generation_config=self.override_generation_config,
 enable_sleep_mode=self.enable_sleep_mode,
 model_impl=self.model_impl,
@@ -10429,7 +10429,26 @@ index 89c9b6747..a5be57ce0 100644
 )

 def create_load_config(self) -> LoadConfig:
-@@ -1504,12 +1520,13 @@ class EngineArgs:
+
++ use_low_bit_loader = False
++
++ if self.low_bit_model_path is not None:
++ use_low_bit_loader = True
+if(self.qlora_adapter_name_or_path is not None) and \
+self.quantization != "bitsandbytes":
+raise ValueError(
+@@ -1079,8 +1099,10 @@ class EngineArgs:
+model_loader_extra_config=self.model_loader_extra_config,
+ignore_patterns=self.ignore_patterns,
+use_tqdm_on_load=self.use_tqdm_on_load,
++ use_low_bit_loader=use_low_bit_loader,
+)
+
++
+def create_speculative_config(
+self,
+target_model_config: ModelConfig,
+@@ -1504,12 +1526,13 @@ class EngineArgs:
 _raise_or_fallback(feature_name=name, recommend_to_remove=True)
 return False

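Note: the hunk above wires a low-bit loading switch into EngineArgs.create_load_config: use_low_bit_loader is derived solely from whether low_bit_model_path is set, then forwarded into LoadConfig next to use_tqdm_on_load. A minimal sketch of that derivation follows; the helper name derive_use_low_bit_loader is hypothetical and stands in for the inline code in the patch.

from typing import Optional

def derive_use_low_bit_loader(low_bit_model_path: Optional[str]) -> bool:
    # Mirrors the patch: any non-None low-bit model path enables the low-bit weight loader.
    return low_bit_model_path is not None

# Forwarded roughly as:
# LoadConfig(..., use_tqdm_on_load=..., use_low_bit_loader=derive_use_low_bit_loader(path))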
@@ -12669,6 +12688,23 @@ index c190a4585..dda2a96cc 100644
 boi = self.boi.expand(x.shape[0], -1, -1)
 eoi = self.eoi.expand(x.shape[0], -1, -1)
 x = torch.cat((boi, x, eoi), dim=1)
+diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
+index cb0379c10..5e8b22ab0 100644
+--- a/vllm/model_executor/models/idefics2_vision_model.py
++++ b/vllm/model_executor/models/idefics2_vision_model.py
+@@ -144,8 +144,10 @@ class Idefics2VisionAttention(nn.Module):
+)
+self.tp_size = get_tensor_model_parallel_world_size()
+self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
+- self.attn = MultiHeadAttention(self.num_heads_per_partition,
+- self.head_dim, self.scale)
++ # self.attn = MultiHeadAttention(self.num_heads_per_partition,
++ # self.head_dim, self.scale)
++ from vllm.model_executor.models.siglip import SelfAttention
++ self.attn = SelfAttention(self.num_heads_per_partition, self.head_dim, self.scale)
+
+def forward(
+self,
 diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
 index 5fab9df3f..f8e6fbe24 100644
 --- a/vllm/model_executor/models/minicpmv.py
@@ -13552,6 +13588,18 @@ index 000000000..d96085f46
 + hidden_states=encoder_outputs.hidden_states,
 + attentions=encoder_outputs.attentions,
 + )
+diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
+index db90848f9..5eabcf653 100644
+--- a/vllm/model_executor/models/phi4mm_audio.py
++++ b/vllm/model_executor/models/phi4mm_audio.py
+@@ -230,6 +230,7 @@ class ConformerEncoderLayer(nn.Module):
+x = x + 0.5 * self.feed_forward_in(x)
+norm_x = self.layer_norm_att(x)
+
++ mask = mask.to(x.device)
+x = x + self.self_attn(
+norm_x,
+norm_x,
 diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
 index c4d02e5dd..2831a5a12 100644
 --- a/vllm/model_executor/models/qwen2.py
@@ -13589,41 +13637,85 @@ index c4d02e5dd..2831a5a12 100644
 )

 diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
-index 1e6ff1fec..e2480326a 100644
+index 1e6ff1fec..90ebe5ca9 100644
 --- a/vllm/model_executor/models/qwen2_5_vl.py
 +++ b/vllm/model_executor/models/qwen2_5_vl.py
-@@ -304,6 +304,10 @@ class Qwen2_5_VisionAttention(nn.Module):
+@@ -302,23 +302,33 @@ class Qwen2_5_VisionAttention(nn.Module):
+"(b s) ... -> b s ...",
+b=batch_size)
 elif self.attn_backend == _Backend.TORCH_SDPA:
-# Execute attention entry by entry for speed & less VRAM.
-outputs = []
-+ head_dim = q.shape[-1]
-+ import math
-+ import xe_addons
-+ scale = 1 / math.sqrt(head_dim)
-for i in range(1, len(cu_seqlens)):
-start_idx = cu_seqlens[i - 1]
-end_idx = cu_seqlens[i]
-@@ -312,10 +316,16 @@ class Qwen2_5_VisionAttention(nn.Module):
-v_i = v[:, start_idx:end_idx]
-q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
-for x in [q_i, k_i, v_i])
+- # Execute attention entry by entry for speed & less VRAM.
+- outputs = []
+- for i in range(1, len(cu_seqlens)):
+- start_idx = cu_seqlens[i - 1]
+- end_idx = cu_seqlens[i]
+- q_i = q[:, start_idx:end_idx]
+- k_i = k[:, start_idx:end_idx]
+- v_i = v[:, start_idx:end_idx]
+- q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d")
+- for x in [q_i, k_i, v_i])
 - output_i = F.scaled_dot_product_attention(q_i,
 - k_i,
 - v_i,
 - dropout_p=0.0)
-+ # output_i = F.scaled_dot_product_attention(q_i,
-+ # k_i,
-+ # v_i,
-+ # dropout_p=0.0)
-+ output_i = xe_addons.sdp_non_causal(
-+ q_i.contiguous(),
-+ k_i.contiguous(),
-+ v_i.contiguous(),
-+ None,
-+ scale)
-output_i = rearrange(output_i, "b h s d -> b s h d ")
-outputs.append(output_i)
-context_layer = torch.cat(outputs, dim=1)
+- output_i = rearrange(output_i, "b h s d -> b s h d ")
+- outputs.append(output_i)
+- context_layer = torch.cat(outputs, dim=1)
++ # TODO(xiangyu): Maybe add attn_backend xpu?
++ q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
++ from vllm._ipex_ops import ipex_ops
++ output = torch.empty(
++ (q.shape[0], q.shape[1], q.shape[2]),
++ dtype=q.dtype,
++ device=q.device)
++ import math
++ head_dim = q.shape[-1]
++ scale = 1 / math.sqrt(head_dim)
++ ipex_ops.varlen_attention(q, k, v, output,
++ cu_seqlens,
++ cu_seqlens,
++ max_seqlen,
++ max_seqlen,
++ pdropout=0,
++ softmax_scale=scale,
++ zero_tensors=False,
++ is_causal=False,
++ return_softmax=False,
++ gen_=None,
++ logits_soft_cap=0
++ )
++
++ context_layer = rearrange(output,
++ "(b s) ... -> b s ...",
++ b=batch_size)
+elif self.attn_backend == _Backend.XFORMERS:
+from xformers import ops as xops
+from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+@@ -613,10 +623,11 @@ class Qwen2_5_VisionTransformer(nn.Module):
+cu_seqlens: torch.Tensor,
+) -> tuple[Optional[int], Optional[list[int]]]:
+max_seqlen, seqlens = None, None
+- if self.attn_backend == _Backend.FLASH_ATTN:
+- max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+- elif self.attn_backend == _Backend.XFORMERS:
+- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
++ # if self.attn_backend == _Backend.FLASH_ATTN:
++ # max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
++ # elif self.attn_backend == _Backend.XFORMERS:
++ # seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
++ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+return max_seqlen, seqlens
+
+def forward(
+@@ -1082,7 +1093,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
+image_input=image_input,
+video_input=video_input)
+input_ids = None
+-
++
+hidden_states = self.language_model.model(
+input_ids=input_ids,
+positions=positions,
 diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
 index a7800d415..26af87512 100644
 --- a/vllm/model_executor/models/qwen2_vl.py
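Note: in the Qwen2.5-VL hunk above, the per-sequence TORCH_SDPA loop (and the earlier xe_addons.sdp_non_causal variant) is replaced by a single packed ipex_ops.varlen_attention call driven by cu_seqlens. The sketch below is a plain-PyTorch reference of what that packed call computes, assuming q, k, v are already flattened to (total_tokens, num_heads, head_dim) as in the patch; it is for illustration only and does not use the XPU kernel.

import torch
import torch.nn.functional as F

def varlen_attention_reference(q, k, v, cu_seqlens):
    """Non-causal attention over packed variable-length sequences.

    q, k, v: (total_tokens, num_heads, head_dim); cu_seqlens: (num_seqs + 1,) cumulative lengths.
    """
    outputs = []
    for i in range(1, len(cu_seqlens)):
        start, end = int(cu_seqlens[i - 1]), int(cu_seqlens[i])
        # One sequence at a time: reshape to (1, num_heads, seq_len, head_dim) for SDPA.
        q_i, k_i, v_i = (x[start:end].transpose(0, 1).unsqueeze(0) for x in (q, k, v))
        # SDPA's default scale is 1/sqrt(head_dim), matching softmax_scale in the patch.
        out_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
        outputs.append(out_i.squeeze(0).transpose(0, 1))  # back to (seq_len, num_heads, head_dim)
    return torch.cat(outputs, dim=0)  # (total_tokens, num_heads, head_dim)

# Example: two packed sequences of lengths 5 and 7.
# q = k = v = torch.randn(12, 16, 80)
# out = varlen_attention_reference(q, k, v, torch.tensor([0, 5, 12]))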
@@ -15133,10 +15225,10 @@ index c271f438e..cf7180606 100755
 assert sliding_window == (-1, -1), (
 diff --git a/vllm/v1/attention/backends/ipex_attn.py b/vllm/v1/attention/backends/ipex_attn.py
 new file mode 100644
-index 000000000..f4a435eaa
+index 000000000..964696cfe
 --- /dev/null
 +++ b/vllm/v1/attention/backends/ipex_attn.py
-@@ -0,0 +1,392 @@
+@@ -0,0 +1,404 @@
 +from dataclasses import dataclass
 +from typing import Any, Dict, List, Optional, Tuple, Type
 +
@@ -15152,6 +15244,10 @@ index 000000000..f4a435eaa
 +from vllm.attention.backends.ipex_attn import use_gqa_kernel
 +from vllm.utils import is_bmg_platform
 +import os
++from vllm.logger import init_logger
++
++logger = init_logger(__name__)
++
 +
 +@dataclass
 +class IPEXAttentionMetadata(FlashAttentionMetadata):
@@ -15246,6 +15342,12 @@ index 000000000..f4a435eaa
 + "are not implemented for "
 + "IpexAttnBackendImpl")
 +
++ flag = os.getenv("IPEX_LLM_PREFILL_VARLEN_BACKEND", None)
++ self.ipex_varlen_attn = False
++ if flag is not None:
++ self.ipex_varlen_attn = True
++ logger.info_once(f"V1 engine using varlen_attention for prefilling.")
++
 + def forward(
 + self,
 + layer: AttentionLayer,
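Note: the constructor change above keys the new V1 ipex_attn prefill path off the mere presence of the IPEX_LLM_PREFILL_VARLEN_BACKEND environment variable; its value is not inspected. A minimal sketch of enabling it before engine start, assuming the variable is read in the serving process:

import os

# Any value works; the backend only checks that the variable is set.
os.environ["IPEX_LLM_PREFILL_VARLEN_BACKEND"] = "1"

ipex_varlen_attn = os.getenv("IPEX_LLM_PREFILL_VARLEN_BACKEND", None) is not None
assert ipex_varlen_attn  # prefill will route through ipex_ops.varlen_attention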
@@ -15293,6 +15395,7 @@ index 000000000..f4a435eaa
 + self.sliding_window,
 + self.alibi_slopes,
 + self.logits_soft_cap,
++ self.ipex_varlen_attn,
 + )
 + return output.view(-1, self.num_heads * self.head_size)
 +
@@ -15367,6 +15470,7 @@ index 000000000..f4a435eaa
 + sliding_window: Optional[List[int]] = None,
 + alibi_slopes: Optional[torch.Tensor] = None,
 + logits_soft_cap: Optional[float] = None,
++ flag: Optional[bool] = False,
 +) -> None:
 + context = get_forward_context()
 + current_metadata = context.attn_metadata
@@ -15382,7 +15486,7 @@ index 000000000..f4a435eaa
 + key = key.view(-1, num_kv_heads, head_size)
 + value = value.view(-1, num_kv_heads, head_size)
 +
-+ if is_bmg_platform:
++ if flag or is_bmg_platform:
 + key_cache, value_cache = kv_cache.unbind(0)
 + ipex_ops.reshape_and_cache_flash(
 + key[:num_actual_tokens],
@@ -17087,7 +17191,7 @@ index 000000000..dffc7b367
 + return (attn_metadata, encoder_input_tokens_tensor,
 + encoder_input_positions_tensor)
 diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
-index 9d49b4385..78e0c54f2 100644
+index 9d49b4385..dc5e95f4e 100644
 --- a/vllm/worker/xpu_model_runner.py
 +++ b/vllm/worker/xpu_model_runner.py
 @@ -5,8 +5,8 @@ import time
@@ -17735,15 +17839,17 @@ index 9d49b4385..78e0c54f2 100644
 max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
 self.model_config)
 if max_mm_tokens > 0:
-@@ -461,6 +820,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -461,6 +820,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 "Computed max_num_seqs (%s) to be less than 1. "
 "Setting it to the minimum value of 1.", expr)
 max_num_seqs = 1
 + '''
++ if "phi4mm" in self.model_config.hf_config.model_type:
++ max_num_seqs = 1

 batch_size = 0
 for group_id in range(max_num_seqs):
-@@ -479,11 +839,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -479,11 +841,14 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 seq_data={group_id: dummy_data.seq_data},
 sampling_params=sampling_params,
 block_tables=None,
@@ -17759,7 +17865,7 @@ index 9d49b4385..78e0c54f2 100644
 finished_requests_ids = [seq.request_id for seq in seqs]
 model_input = self.prepare_model_input(
 seqs, finished_requests_ids=finished_requests_ids)
-@@ -493,25 +856,39 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -493,25 +858,39 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 batch_size=batch_size,
 dtype=self.model_config.dtype,
 device=self.device)
@@ -17810,7 +17916,7 @@ index 9d49b4385..78e0c54f2 100644
 """Helper method to prepare the model input based on a given sequence
 group. Prepares metadata needed for the base model forward pass but not
 metadata for possible additional steps, e.g., sampling.
-@@ -524,6 +901,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -524,6 +903,22 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):

 return builder.build() # type: ignore

@@ -17833,7 +17939,7 @@ index 9d49b4385..78e0c54f2 100644
 def prepare_model_input(
 self,
 seq_group_metadata_list: List[SequenceGroupMetadata],
-@@ -563,6 +956,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -563,6 +958,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 raise ValueError(
 "XPUModelRunner does not support multi-step execution.")

@@ -17846,7 +17952,7 @@ index 9d49b4385..78e0c54f2 100644
 model_executable = self.model
 if (self.observability_config is not None
 and self.observability_config.collect_model_forward_time):
-@@ -612,3 +1011,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
+@@ -612,3 +1013,9 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
 output.model_forward_time = model_forward_time

 return [output]
