
Commit 6d5e0a5

Merge branch 'main' into enable_autorun_on_ready_for_eval

2 parents: c5c9d34 + 155ad56


71 files changed: +1193 −434 lines

.buildkite/test-pipeline.yaml

Lines changed: 26 additions & 1 deletion

@@ -473,7 +473,9 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
-  - pytest -v -s compile/test_fusions_e2e.py
+  # Limit to no custom ops to reduce running time
+  # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"

 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -930,6 +932,29 @@ steps:
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   # this runner has 2 GPUs available even though num_gpus=2 is not set
   - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+  # Wrap with quotes to escape yaml
+  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusions_e2e.py
+  commands:
+  - nvidia-smi
+  # Run all e2e fusion tests
   - pytest -v -s tests/compile/test_fusions_e2e.py

 - label: Blackwell GPT-OSS Eval
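The new `-k` expressions above select a subset of the parametrized fusion tests by substring-matching their test IDs; the surrounding quotes exist only so YAML does not treat the leading `-` as a new list item. A minimal sketch of how such a filter behaves, using a hypothetical parametrization rather than the real matrix in test_fusions_e2e.py:

# Hypothetical example: running this file with
#   pytest -k 'TRITON and -quant_fp8'
# collects only the test IDs that contain both the substring "TRITON" and the
# substring "-quant_fp8" (the no-custom-op Triton cases) and skips the rest.
import pytest

@pytest.mark.parametrize("backend", ["TRITON", "FLASHINFER"])
@pytest.mark.parametrize("custom_ops", ["-quant_fp8", "+quant_fp8"])
def test_fusion_sketch(backend, custom_ops):
    assert backend and custom_ops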

README.md

Lines changed: 1 addition & 1 deletion

@@ -84,7 +84,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support

csrc/quantization/fp4/nvfp4_quant_kernels.cu

Lines changed: 20 additions & 2 deletions

@@ -31,6 +31,13 @@

 namespace vllm {

+template <typename Int>
+__host__ __device__ inline Int round_up(Int x, Int y) {
+  static_assert(std::is_integral_v<Int>,
+                "round_up argument must be integral type");
+  return (x + y - 1) / y * y;
+}
+
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
@@ -42,10 +49,21 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");

+  int sf_m = round_up<int>(numRows, 128);
+  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
+  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
+  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+    // Each thread writes 4 uint32_t elements.
+    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
+         col += blockDim.x * 4) {
+      SFout[row * sf_n_int + col] = 0x00;
+    }
+  }
+
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];

   // Input tensor row/col loops.
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
@@ -64,7 +82,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
           rowIdx, colIdx, numCols, SFout);

       out_pos =
-          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
     }
   }
 }
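The new `round_up` helper and zero-fill loop pad the swizzled scale-factor output so the row count is a multiple of 128 and the packed column count is a whole number of uint32 words. A rough Python sketch of that shape arithmetic, assuming one FP8 scale per 16 elements (CVT_FP4_SF_VEC_SIZE == 16, as in the surrounding kernel) packed four scales per uint32:

# Sketch of the padded scale-factor buffer shape used by the kernel above.
def round_up(x: int, y: int) -> int:
    return (x + y - 1) // y * y

def sf_buffer_shape(num_rows: int, num_cols: int, sf_vec_size: int = 16):
    sf_m = round_up(num_rows, 128)                # rows padded to a multiple of 128
    sf_n_unpadded = num_cols // sf_vec_size       # raw number of scales per row
    sf_n_int = round_up(sf_n_unpadded, 4) // 4    # uint32 words per row
    return sf_m, sf_n_int

print(sf_buffer_shape(100, 1024))  # -> (128, 16)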

docs/README.md

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support

docs/usage/troubleshooting.md

Lines changed: 4 additions & 0 deletions

@@ -316,6 +316,10 @@ Traceback (most recent call last):

 This indicates vLLM failed to initialize the NCCL communicator, possibly due to a missing `IPC_LOCK` linux capability or an unmounted `/dev/shm`. Refer to [Enabling GPUDirect RDMA](../serving/parallelism_scaling.md#enabling-gpudirect-rdma) for guidance on properly configuring the environment for GPUDirect RDMA.

+## CUDA error: the provided PTX was compiled with an unsupported toolchain
+
+If you see an error like `RuntimeError: CUDA error: the provided PTX was compiled with an unsupported toolchain.`, it means the CUDA PTX in vLLM's wheels was compiled with a toolchain your system does not support. The released vLLM wheels are compiled with a specific version of the CUDA toolkit, and the compiled code may fail to run on older CUDA drivers. See [CUDA compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/) for details. The solution is to install the `cuda-compat` package from your package manager. For example, on Ubuntu you can run `sudo apt-get install cuda-compat-12-9` and then add `export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH` to your `.bashrc` file. Once it is installed successfully, the output of `nvidia-smi` should show `CUDA Version: 12.9`. Note that CUDA 12.9 is only an example; you may need a higher version of the cuda-compat package if vLLM's default CUDA version moves higher.
+
 ## Known Issues

 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000), which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759).
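As a quick check for the PTX/toolchain mismatch described in the new troubleshooting entry above, you can compare the CUDA toolkit version the installed wheel was built against with the CUDA version the driver reports. An illustrative snippet (an assumption, not part of vLLM; it only requires PyTorch and `nvidia-smi`):

# Illustrative check: if the wheel's build CUDA version is newer than the
# driver-reported "CUDA Version", the cuda-compat package (or a driver
# upgrade) is likely needed.
import subprocess
import torch

print("wheel built against CUDA:", torch.version.cuda)
smi = subprocess.check_output(["nvidia-smi"], text=True)
print(next(line.strip() for line in smi.splitlines() if "CUDA Version" in line))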

tests/compile/test_fusions_e2e.py

Lines changed: 5 additions & 7 deletions

@@ -54,11 +54,11 @@ class ModelBackendTestCase(NamedTuple):

 MODELS_FP4 = [
     ModelBackendTestCase(
-        model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+        model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
         model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
         backend=_Backend.FLASHINFER,
-        attention_fusions=48,
-        allreduce_fusions=96,
+        attention_fusions=32,
+        allreduce_fusions=65,
     ),
 ]

@@ -95,8 +95,7 @@ class ModelBackendTestCase(NamedTuple):
     ),
 ]

-# TODO(luka) test both in nightly
-CUSTOM_OPS_FP8 = ["-quant_fp8"]  # , "+quant_fp8"]
+CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]


 @pytest.mark.parametrize(
@@ -171,8 +170,7 @@ def test_attn_quant(
     assert int(matches[0]) == attention_fusions


-# TODO(luka) test both in nightly
-CUSTOM_OPS_RMS_NORM = ["-rms_norm"]  # , "+rms_norm"]
+CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]


 def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
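The hunk above ends at the signature of `custom_ops_product`, whose body is not shown. For context, a plausible sketch (an assumption, not the actual vLLM implementation) of how such a helper could combine the now fully enabled `CUSTOM_OPS_FP8` and `CUSTOM_OPS_RMS_NORM` lists into per-test custom-ops strings:

# Hypothetical combinator; the real helper lives in
# tests/compile/test_fusions_e2e.py and may differ.
from collections.abc import Iterable
from itertools import product

CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]

def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
    # Cartesian product of the per-op choices, joined into one
    # comma-separated custom_ops string per test case.
    for combo in product(*custom_ops_lists):
        yield ",".join(combo)

print(list(custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)))
# ['-quant_fp8,-rms_norm', '-quant_fp8,+rms_norm',
#  '+quant_fp8,-rms_norm', '+quant_fp8,+rms_norm']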

tests/kernels/quantization/test_nvfp4_quant.py

Lines changed: 0 additions & 2 deletions

@@ -168,9 +168,7 @@ def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
     out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)

     out, out_scale = ops.scaled_fp4_quant(x, global_scale)
-
     scale_ans = recover_swizzled_scales(out_scale, m, n)
     out_ans = cast_from_fp4(out, m, n)
-
     torch.testing.assert_close(out_ans, out_ref)
     torch.testing.assert_close(scale_ans, scale_ref)

tests/samplers/test_logprobs.py

Lines changed: 96 additions & 0 deletions (new file)

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm import SamplingParams
from vllm.logprobs import FlattenLogprobs

MODELS = ["distilbert/distilgpt2"]
MAX_TOKENS = 5
NUM_TOP_LOGPROBS = 5
NUM_PROMPT_LOGPROBS = 7
MAX_LOGPROBS = max(NUM_TOP_LOGPROBS, NUM_PROMPT_LOGPROBS)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("greedy", [True, False])
@pytest.mark.parametrize("flatten_logprobs", [True, False])
def test_ranks(
    vllm_runner,
    model,
    dtype,
    greedy,
    flatten_logprobs,
    example_prompts,
    monkeypatch: pytest.MonkeyPatch,
):
    monkeypatch.setenv("VLLM_FLATTEN_LOGPROBS", "1" if flatten_logprobs else "0")
    with vllm_runner(model, dtype=dtype, max_logprobs=MAX_LOGPROBS) as vllm_model:
        tokenizer = vllm_model.llm.get_tokenizer()
        example_prompt_tokens = [tokenizer.encode(prompt) for prompt in example_prompts]
        sampling_params = SamplingParams(
            temperature=0.0 if greedy else 1.0,
            top_p=1.0,
            max_tokens=MAX_TOKENS,
            logprobs=NUM_TOP_LOGPROBS,
            prompt_logprobs=NUM_PROMPT_LOGPROBS,
        )
        results = vllm_model.generate_w_logprobs(example_prompts, sampling_params)

    assert len(results) == len(example_prompt_tokens)
    for i, (result, prompt_tokens) in enumerate(zip(results, example_prompt_tokens)):
        decode_tokens, _, decode_logprobs, prompt_logprobs = result

        # Ensure the return type of logprobs is accurate
        assert isinstance(
            prompt_logprobs, FlattenLogprobs if flatten_logprobs else list
        )
        assert isinstance(
            decode_logprobs, FlattenLogprobs if flatten_logprobs else list
        )

        ########################
        # Check prompt logprobs
        ########################
        assert len(prompt_tokens) == len(prompt_logprobs)
        # No logprob for first prompt token
        assert not prompt_logprobs[0]
        for position, (token, logprobs) in enumerate(
            zip(prompt_tokens[1:], prompt_logprobs[1:]), start=1
        ):
            # Ensure logprobs of prompt token is always returned
            logprob = logprobs.get(token)
            assert logprob is not None
            assert logprob.rank >= 1
            # Ensure # of returned logprobs should be
            # either NUM_PROMPT_LOGPROBS or NUM_PROMPT_LOGPROBS+1
            assert NUM_PROMPT_LOGPROBS <= len(logprobs) <= NUM_PROMPT_LOGPROBS + 1
            # Ensure top NUM_PROMPT_LOGPROBS is always extracted
            assert set(range(1, NUM_PROMPT_LOGPROBS + 1)).issubset(
                {logprob.rank for logprob in logprobs.values()}
            )

        ########################
        # Check sample logprobs
        ########################
        assert len(decode_tokens) == len(decode_logprobs)
        for position, (token, logprobs) in enumerate(
            zip(decode_tokens, decode_logprobs)
        ):
            # Ensure logprobs of chosen token is always returned
            logprob = logprobs.get(token)
            assert logprob is not None
            if greedy:
                # For greedy sampling, all chosen logprob should be top ranked
                assert logprob.rank == 1
            else:
                assert logprob.rank >= 1
            # Ensure # of returned logprobs should be
            # either NUM_TOP_LOGPROBS or NUM_TOP_LOGPROBS+1
            assert NUM_TOP_LOGPROBS <= len(logprobs) <= NUM_TOP_LOGPROBS + 1
            # Ensure top NUM_TOP_LOGPROBS logprobs is always extracted
            assert set(range(1, NUM_TOP_LOGPROBS + 1)).issubset(
                {logprob.rank for logprob in logprobs.values()}
            )
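The `NUM_TOP_LOGPROBS` / `NUM_TOP_LOGPROBS + 1` bounds asserted above reflect that the chosen token's logprob is reported even when it falls outside the requested top-k. A toy illustration of that invariant (not vLLM internals):

# Toy illustration: the returned ranks are the top-k plus, possibly, the rank
# of the sampled token when it is not already among the top-k.
NUM_TOP_LOGPROBS = 5
top_k_ranks = set(range(1, NUM_TOP_LOGPROBS + 1))
for sampled_rank in (1, 12):  # greedy-like pick vs. a token sampled outside the top-k
    returned = top_k_ranks | {sampled_rank}
    assert NUM_TOP_LOGPROBS <= len(returned) <= NUM_TOP_LOGPROBS + 1
    assert top_k_ranks.issubset(returned)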

tests/samplers/test_ranks.py

Lines changed: 0 additions & 59 deletions
This file was deleted.
