8 files changed: +28 −23 lines

tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform
@@ -27,7 +27,6 @@ def get_attn_backend_cls(
         dtype,
         kv_cache_dtype,
         block_size,
-        use_v1,
         use_mla,
         has_sink,
         use_sparse,
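For context, the change above is the plugin-side part of the migration: out-of-tree platforms simply drop use_v1 from their get_attn_backend_cls override. Below is a hedged sketch of what such an updated override might look like; the class name, return string, and parameter comments are illustrative assumptions, not code from this PR.

# Illustrative sketch only: a hypothetical out-of-tree platform after removing
# use_v1, mirroring the parameter order visible in this diff.
from __future__ import annotations

import torch


class MyDummyPlatform:  # a real plugin would subclass vLLM's Platform interface
    device_name = "dummy"

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend,          # backend explicitly selected by the user, if any
        head_size: int,
        dtype: torch.dtype,
        kv_cache_dtype: str | None,
        block_size: int,
        use_mla: bool,             # use_v1 is gone; V1 is the only engine now
        has_sink: bool,
        use_sparse: bool,
    ) -> str:
        # Return the fully qualified path of the plugin's attention backend class.
        return "my_plugin.attention.MyAttentionBackend"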
@@ -187,17 +187,33 @@ def _cached_get_attn_backend(
     # get device-specific attn_backend
     from vllm.platforms import current_platform
 
-    attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend,
-        head_size,
-        dtype,
-        kv_cache_dtype,
-        block_size,
-        True,
-        use_mla,
-        has_sink,
-        use_sparse,
-    )
+    try:
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            use_mla,
+            has_sink,
+            use_sparse,
+        )
+    except TypeError:
+        logger.warning_once(
+            "use_v1 parameter for get_attn_backend_cls is deprecated and will "
+            "be removed in the future. Please remove it from your plugin code."
+        )
+        attention_cls = current_platform.get_attn_backend_cls(
+            selected_backend,
+            head_size,
+            dtype,
+            kv_cache_dtype,
+            block_size,
+            True,
+            use_mla,
+            has_sink,
+            use_sparse,
+        )
     if not attention_cls:
         raise ValueError(
             f"Invalid attention backend for {current_platform.device_name} "
@@ -131,7 +131,6 @@ def get_attn_backend_cls(
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -145,8 +144,6 @@ def get_attn_backend_cls(
         if use_sparse:
             raise NotImplementedError("Sparse Attention is not supported on CPU.")
         logger.info("Using Torch SDPA backend.")
-        if not use_v1:
-            raise ValueError("CPU backend only supports V1.")
         return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
 
     @classmethod
@@ -253,7 +253,6 @@ def get_attn_backend_cls(
             dtype,
             kv_cache_dtype,
             block_size,
-            use_v1,
             use_mla,
             has_sink,
             use_sparse,
@@ -187,7 +187,6 @@ def get_attn_backend_cls(
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse: bool,
@@ -222,7 +222,6 @@ def get_attn_backend_cls(
             dtype,
             kv_cache_dtype,
             block_size,
-            use_v1,
             use_mla,
             has_sink,
             use_sparse,
@@ -59,7 +59,6 @@ def get_attn_backend_cls(
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink,
         use_sparse,
@@ -71,8 +70,6 @@ def get_attn_backend_cls(
         if selected_backend != _Backend.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)
 
-        if not use_v1:
-            raise ValueError("TPU backend only supports V1.")
         logger.info("Using Pallas V1 backend.")
         return "vllm.v1.attention.backends.pallas.PallasAttentionBackend"
 
@@ -49,7 +49,6 @@ def get_attn_backend_cls(
         dtype: torch.dtype,
         kv_cache_dtype: str | None,
         block_size: int,
-        use_v1: bool,
         use_mla: bool,
         has_sink: bool,
         use_sparse,
@@ -77,7 +76,7 @@ def get_attn_backend_cls(
         elif selected_backend:
             raise ValueError(
                 f"Invalid attention backend for {cls.device_name}, "
-                f"with use_v1: {use_v1} use_mla: {use_mla} "
+                f"with use_mla: {use_mla} "
             )
 
         logger.info("Using Flash Attention backend.")
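Not covered by this PR, but a natural follow-up for plugin authors who must support both older vLLM releases (which pass use_v1 positionally after block_size) and the new calling convention shown above: one possible shim is to absorb the trailing arguments and branch on their count, so the core's first call succeeds and the deprecation fallback never fires. This is an assumption-laden sketch, not a recommendation from the PR; the class and return string are illustrative.

# Hypothetical dual-compatibility override for a plugin targeting both the old
# and new call signatures; purely illustrative.
class FlexiblePlatform:
    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, *rest):
        if len(rest) == 4:
            # Older vLLM passes (use_v1, use_mla, has_sink, use_sparse).
            _use_v1, use_mla, has_sink, use_sparse = rest
        else:
            # This PR's convention passes (use_mla, has_sink, use_sparse).
            use_mla, has_sink, use_sparse = rest
        return "my_plugin.attention.MyAttentionBackend"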