43 commits
5b45943
enable VllmDeployer to fail fast if the underlying vllm process failed.
wangshangsam Dec 10, 2025
bad5387
example slurm script for submitting jobs
wangshangsam Dec 10, 2025
08b32cc
fix slurm scripts
wangshangsam Dec 11, 2025
1cdf563
small fix
wangshangsam Dec 11, 2025
d9caddc
[Automated Commit] Format Codebase
github-actions[bot] Dec 11, 2025
6f62339
Update the readme about the example slurm scripts.
wangshangsam Dec 11, 2025
88b34a4
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 11, 2025
59dc167
Change the default endpoint startup timeout to 1 hour in case someone…
wangshangsam Dec 11, 2025
d9c0bcc
change server expected qps and target latency
johncalesp Dec 11, 2025
a75dc68
Change the default dataset repo_id to the new name of the public dataset
wangshangsam Dec 12, 2025
866eba9
[Automated Commit] Format Codebase
github-actions[bot] Dec 12, 2025
a8a8870
evaluate the json file with multiprocess
johncalesp Dec 12, 2025
9f3b52e
[Automated Commit] Format Codebase
github-actions[bot] Dec 12, 2025
0342909
change default server_target_latency to 12
wangshangsam Dec 12, 2025
7576e0c
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 12, 2025
d10d634
revert evaluation changes
johncalesp Dec 12, 2025
e75a34a
[Automated Commit] Format Codebase
github-actions[bot] Dec 12, 2025
2209ae6
update slurm script
wangshangsam Dec 14, 2025
1450143
update slurm script
wangshangsam Dec 15, 2025
6a5f17d
revert evaluation.py changes after analysing the discrepancy in is_se…
johncalesp Dec 15, 2025
d5d2cc8
[Automated Commit] Format Codebase
github-actions[bot] Dec 15, 2025
f72d82d
linting
wangshangsam Dec 16, 2025
0e731ed
[Automated Commit] Format Codebase
github-actions[bot] Dec 16, 2025
4771f13
lock in model and dataset SHA
wangshangsam Dec 16, 2025
55a8cf1
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 16, 2025
d4d6f78
[Automated Commit] Format Codebase
github-actions[bot] Dec 16, 2025
c0d0925
Specify model quality target and server target latency in the README
wangshangsam Dec 16, 2025
e2adf60
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 16, 2025
7dabbfe
Update loadgen/mlperf.conf
wangshangsam Dec 18, 2025
423cea4
aligning TestSettings' C++ code with its python binding
wangshangsam Dec 18, 2025
817f0e8
[Automated Commit] Format Codebase
github-actions[bot] Dec 18, 2025
9d3b36b
remove ttft and tpot from mlperf.conf
wangshangsam Dec 18, 2025
29e7c1a
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 18, 2025
95f4179
Enable CLI to take in user.conf
wangshangsam Dec 18, 2025
5370ecd
[Automated Commit] Format Codebase
github-actions[bot] Dec 18, 2025
f9d983f
readme
wangshangsam Dec 19, 2025
897894d
Merge branch 'wangshangsam/fix-req-timeout' of github.com:CentML/mlpe…
wangshangsam Dec 19, 2025
8f8e886
Merge branch 'master' into wangshangsam/fix-req-timeout
wangshangsam Dec 19, 2025
f8e6bf8
readme
wangshangsam Dec 19, 2025
8bfbeb9
rename vl2l -> q3vl
wangshangsam Dec 19, 2025
b589ddd
[Automated Commit] Format Codebase
github-actions[bot] Dec 19, 2025
3b065ee
empty
wangshangsam Dec 19, 2025
eb65590
rerun ci
wangshangsam Dec 19, 2025
16 changes: 10 additions & 6 deletions loadgen/bindings/python_api.cc
@@ -312,6 +312,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
&TestSettings::server_num_issue_query_threads)
.def_readwrite("offline_expected_qps",
&TestSettings::offline_expected_qps)
.def_readwrite("sample_concatenate_permutation",
&TestSettings::sample_concatenate_permutation)
.def_readwrite("min_duration_ms", &TestSettings::min_duration_ms)
.def_readwrite("max_duration_ms", &TestSettings::max_duration_ms)
.def_readwrite("min_query_count", &TestSettings::min_query_count)
@@ -324,6 +326,14 @@ PYBIND11_MODULE(mlperf_loadgen, m) {
&TestSettings::accuracy_log_rng_seed)
.def_readwrite("accuracy_log_probability",
&TestSettings::accuracy_log_probability)
.def_readwrite("accuracy_log_sampling_target",
&TestSettings::accuracy_log_sampling_target)
.def_readwrite("test05", &TestSettings::test05)
.def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
.def_readwrite("test05_sample_index_rng_seed",
&TestSettings::test05_sample_index_rng_seed)
.def_readwrite("test05_schedule_rng_seed",
&TestSettings::test05_schedule_rng_seed)
.def_readwrite("print_timestamps", &TestSettings::print_timestamps)
.def_readwrite("performance_issue_unique",
&TestSettings::performance_issue_unique)
@@ -333,12 +343,6 @@
&TestSettings::performance_issue_same_index)
.def_readwrite("performance_sample_count_override",
&TestSettings::performance_sample_count_override)
.def_readwrite("test05", &TestSettings::test05)
.def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
.def_readwrite("test05_sample_index_rng_seed",
&TestSettings::test05_sample_index_rng_seed)
.def_readwrite("test05_schedule_rng_seed",
&TestSettings::test05_schedule_rng_seed)
.def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
.def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
.def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
10 changes: 8 additions & 2 deletions loadgen/mlperf.conf
@@ -26,6 +26,7 @@ rgat.*.performance_sample_count_override = 788379
pointpainting.*.performance_sample_count_override = 1024
deepseek-r1.*.performance_sample_count_override = 4388
whisper.*.performance_sample_count_override = 1633
qwen3-vl-235b-a22b.*.performance_sample_count_override = 48289
# set to 0 to let entire sample set to be performance sample
3d-unet.*.performance_sample_count_override = 0

@@ -67,7 +68,7 @@ llama3_1-8b-edge.*.sample_concatenate_permutation = 1
llama3_1-8b-interactive.*.sample_concatenate_permutation = 1
deepseek-r1.*.sample_concatenate_permutation = 1
whisper.*.sample_concatenate_permutation = 1

qwen3-vl-235b-a22b.*.sample_concatenate_permutation = 1
*.Server.target_latency = 10
*.Server.target_latency_percentile = 99
*.Server.target_duration = 0
@@ -91,7 +92,9 @@ llama3_1-8b-edge.*.use_token_latencies = 1
llama3_1-8b-interactive.*.use_token_latencies = 1
deepseek-r1.*.use_token_latencies = 1
whisper.*.use_token_latencies = 1

# For the VLM benchmark, the model response is relatively short; therefore, we track
# end-to-end latency instead of token latencies.
qwen3-vl-235b-a22b.*.use_token_latencies = 0
# gptj benchmark infers token latencies
gptj.*.infer_token_latencies = 1
gptj.*.token_latency_scaling_factor = 69
@@ -132,6 +135,8 @@ deepseek-r1.Server.target_latency = 0
deepseek-r1.Server.ttft_latency = 2000
deepseek-r1.Server.tpot_latency = 80

qwen3-vl-235b-a22b.Server.target_latency = 12000

*.Offline.target_latency_percentile = 90
*.Offline.min_duration = 600000

@@ -156,6 +161,7 @@ mixtral-8x7b.Offline.min_query_count = 15000
rgat.Offline.min_query_count = 788379
deepseek-r1.Offline.min_query_count = 4388
whisper.Offline.min_query_count = 1633
qwen3-vl-235b-a22b.Offline.min_query_count = 48289

# These fields should be defined and overridden by user.conf.
*.SingleStream.target_latency = 10
8 changes: 4 additions & 4 deletions loadgen/test_settings.h
@@ -234,10 +234,6 @@ struct TestSettings {
uint64_t test05_qsl_rng_seed = 0;
uint64_t test05_sample_index_rng_seed = 0;
uint64_t test05_schedule_rng_seed = 0;

/// \brief Load mlperf parameter config from file.
int FromConfig(const std::string &path, const std::string &model,
const std::string &scenario, int conf_type = 1);
/**@}*/

// ==================================
@@ -272,6 +268,10 @@ struct TestSettings {
bool infer_token_latencies = false;
uint64_t token_latency_scaling_factor;
/**@}*/

/// \brief Load mlperf parameter config from file.
int FromConfig(const std::string &path, const std::string &model,
const std::string &scenario, int conf_type = 1);
};

///
File renamed without changes.
272 changes: 272 additions & 0 deletions multimodal/qwen3-vl/README.md
@@ -0,0 +1,272 @@
# Reference Implementation for the Qwen3-VL (Q3VL) Benchmark

## Quick Start

This guide demonstrates how you can run the benchmark on your local machine.

### Create a Conda environment

Follow [this link](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions)
to install Miniconda on your host machine. Then create a new conda
environment via:

```bash
conda create -n mlperf-inf-mm-q3vl python=3.12
```
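
Then activate the new environment so that the CLI in the next step is installed into it:

```bash
conda activate mlperf-inf-mm-q3vl
```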

### Install the Q3VL benchmarking CLI

#### For users

Install `mlperf-inf-mm-q3vl` with:

```bash
pip install git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/
```

#### For developers

Clone the MLPerf Inference repo via:

```bash
git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf-inference
```

Then enter the repo:

```bash
cd mlperf-inference/
```

Install `mlperf-inf-mm-q3vl` and the development tools with:

- On Bash
```bash
pip install -e multimodal/qwen3-vl/[dev]
```
- On Zsh
```zsh
pip install -e multimodal/qwen3-vl/"[dev]"
```

### Post Q3VL benchmarking CLI installation

After installation, you can check the CLI flags that `mlperf-inf-mm-q3vl` can take with:

```bash
mlperf-inf-mm-q3vl --help
```

You can enable shell autocompletion for `mlperf-inf-mm-q3vl` with:

```bash
mlperf-inf-mm-q3vl --install-completion
```

> [!NOTE]
> Shell auto-completion will take effect once you restart the terminal.

### Start an inference endpoint on your local host machine with vLLM

Please refer to [this guide on how to launch vLLM for various Qwen3 VL MoE models](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3-VL.html). For example, you can start an endpoint on your local host machine with:

```bash
# --gpus all                  : use all the GPUs on this host machine.
# -v ~/.cache/huggingface:... : reuse the HuggingFace cache from your host machine.
# -p 8000:8000                : this assumes the endpoint will use port 8000.
# --ipc=host                  : let the container use the host's IPC mechanisms (e.g., shared memory).
# vllm/vllm-openai:nightly    : you can also use the `:latest` image or a specific release.
# --model ...                 : the model for vLLM to deploy.
# --tensor-parallel-size 8    : 8-way tensor-parallel inference across 8 GPUs.
# --limit-mm-per-prompt.video : the input requests contain images only (i.e., no videos).
docker run --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -p 8000:8000 \
  --ipc=host \
  vllm/vllm-openai:nightly \
  --model Qwen/Qwen3-VL-235B-A22B-Instruct \
  --tensor-parallel-size 8 \
  --limit-mm-per-prompt.video 0
```
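
Once vLLM has finished loading the model, you can optionally confirm that the endpoint
is reachable before starting the benchmark. This assumes the default port `8000` used
above; `/v1/models` is part of the OpenAI-compatible API served by vLLM:

```bash
# List the deployed model(s); a JSON response indicates the endpoint is up.
curl -s http://localhost:8000/v1/models
```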

### Run the benchmark for the Offline scenario

Performance only mode:

```bash
mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode performance_only
```

Accuracy only mode:

```bash
mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode accuracy_only
```

### Run the benchmark for the Server scenario

Performance only mode:

```bash
mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode performance_only
```

Accuracy only mode:

```bash
mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only
```
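
Each run produces the standard LoadGen log files, such as `mlperf_log_summary.txt`,
`mlperf_log_detail.txt`, and, in accuracy mode, `mlperf_log_accuracy.json`. The
evaluation step below assumes they are written under `output/`:

```bash
# Inspect the LoadGen logs from the latest run (assuming the output/ directory
# referenced in the evaluation step below).
ls output/
```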

### Pass in `user.conf`

You can pass in a `user.conf` file through `--settings.user_conf.path`; the LoadGen
parameters provided through the CLI are then overridden by your `user.conf` and by the
`mlperf.conf` bundled inside LoadGen. An example `user.conf` file is included:
[example_user.conf](./example_user.conf). You can run the benchmark with a `user.conf`
via:

```bash
mlperf-inf-mm-q3vl benchmark endpoint \
--settings.test.scenario <scenario> \
--settings.test.mode <mode> \
--settings.user_conf.path example_user.conf
```
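
For reference, a minimal `user.conf` might look like the following sketch. The keys
follow the usual LoadGen `<model>.<scenario>.<setting>` convention, and the QPS values
here are placeholders only; consult [example_user.conf](./example_user.conf) for the
actual settings:

```bash
# Write an illustrative user.conf (placeholder QPS values; see example_user.conf
# for the settings shipped with this benchmark).
cat > user.conf <<'EOF'
qwen3-vl-235b-a22b.Offline.target_qps = 1.0
qwen3-vl-235b-a22b.Server.target_qps = 1.0
EOF
```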

### Evaluate the response quality

You should pass the `mlperf_log_accuracy.json` file (generated by LoadGen) to the
`--filename` flag of the `mlperf-inf-mm-q3vl evaluate` command.

```bash
mlperf-inf-mm-q3vl evaluate --filename output/mlperf_log_accuracy.json
```

## Docker

[docker/](docker/) provides examples of Dockerfiles that install the Q3VL benchmarking
CLI into the container images of the inference engine. This is useful when you have to
run both the inference engine and the Q3VL benchmarking CLI inside the same container,
for example, in a situation where you must use a GPU cluster managed by
[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and
[pyxis](https://github.com/NVIDIA/pyxis).

As an illustrative example, assuming that you are at the root directory of the MLPerf
Inference repo:

1. You can build a container image against vLLM's
`vllm/vllm-openai:v0.12.0` release with:

```bash
docker build \
--build-arg BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 \
--build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=multimodal/qwen3-vl \
-f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \
-t mlperf-inf-mm-q3vl:vllm-openai-v0.12.0 \
.
```
> [!NOTE]
> `MLPERF_INF_MM_Q3VL_INSTALL_URL` can also take in a remote GitHub location, such as
> `git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/`.

2. Afterwards, you can start the container in interactive mode with:

```bash
docker run --rm -it --gpus all -v ~/.cache:/root/.cache --ipc=host mlperf-inf-mm-q3vl:vllm-openai-v0.12.0
```
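
Inside the container, you can verify that the CLI was installed into the image before
moving on to the combined benchmark command described next:

```bash
mlperf-inf-mm-q3vl --help
```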

### Benchmark against vLLM inside the container

If you are running `mlperf-inf-mm-q3vl` inside a local environment that has access to
vLLM (such as inside a container that was created using the
[docker/vllm-cuda.Dockerfile](docker/vllm-cuda.Dockerfile)), you can use a single
`mlperf-inf-mm-q3vl benchmark vllm` command to:

1. Deploy an endpoint using vLLM.
2. Wait for the endpoint to be healthy.
3. Run the benchmark against that endpoint.

For example, inside the container, you can run the Offline scenario Accuracy only
mode with:

```bash
mlperf-inf-mm-q3vl benchmark vllm \
--settings.test.scenario offline \
--settings.test.mode accuracy_only \
--settings.user_conf.path example_user.conf \
--vllm.cli=--async-scheduling \
--vllm.cli=--max-model-len=32768 \
--vllm.cli=--max-num-seqs=1024 \
--vllm.cli=--compilation-config='{
"cudagraph_capture_sizes": [
1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128,
136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248,
256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480,
496, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768
]
}' \
--vllm.cli=--limit-mm-per-prompt.video=0 \
--vllm.cli=--tensor-parallel-size=8
```

## Slurm

[scripts/slurm/](scripts/slurm/) provides example scripts for running both the benchmark
and the response quality evaluation in a GPU cluster managed by
[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and
[pyxis](https://github.com/NVIDIA/pyxis). Specifically,

- [scripts/slurm/benchmark.sh](scripts/slurm/benchmark.sh) is an sbatch script that
runs the benchmarking job.
- [scripts/slurm/evaluate.sh](scripts/slurm/evaluate.sh) is an sbatch script that runs
the evaluation job.
- [scripts/slurm/submit.sh](scripts/slurm/submit.sh) is a Bash script that submits both
jobs; the evaluation job runs only if the benchmarking job has succeeded.

You can check the CLI flags that [scripts/slurm/submit.sh](scripts/slurm/submit.sh) can
take via:

```bash
bash submit.sh --help
```
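
The command above assumes your current working directory already contains `submit.sh`;
from the root of the MLPerf Inference repo, the equivalent invocation would be:

```bash
bash multimodal/qwen3-vl/scripts/slurm/submit.sh --help
```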

> [!NOTE]
> Slurm clusters are often highly customized per organization. If you are unfamiliar
> with Slurm, check with your organization's cluster administrator first, make sure you
> understand what these example scripts do, and adapt them to the specific settings of
> the Slurm cluster you are going to use before launching any jobs.

## Reference Implementation Specification

- v6.0 Round
- vLLM version: [v0.12.0](https://github.com/vllm-project/vllm/releases/tag/v0.12.0)
- Model:
- [Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)
- Commit SHA: [710c13861be6c466e66de3f484069440b8f31389](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct/tree/710c13861be6c466e66de3f484069440b8f31389)
- Dataset:
- [Shopify/product-catalogue](https://huggingface.co/datasets/Shopify/product-catalogue)
- Commit SHA: [d5c517c509f5aca99053897ef1de797d6d7e5aa5](https://huggingface.co/datasets/Shopify/product-catalogue/tree/d5c517c509f5aca99053897ef1de797d6d7e5aa5)
- Both the `train` and the `test` splits are used and concatenated in that order.
- Total number of samples: `48289`.
- Guided decoding is not used.
- Constraints:
- Model quality:
- Category Hierarchical F1 Score $\ge$ `0.7824`. This is the 99% recovery of
`0.7903037`, which is the mean category hierarchical F1 score across 10 runs on
[the BF16 version of the model](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)
(see the quick arithmetic check after this list). The standard deviation across those 10 runs is `0.0002250412555`.
- Server Scenario:
- Target latency is used as the constraint, instead of Time to First Token (TTFT)
or Time per Output Token (TPOT) latencies.
- Target latency percentile = `0.99`.
- Target latency $\le$ 12 seconds.
- Offline Scenario:
- Number of samples in the query $\ge$ `48289` (i.e., every sample in the entire
dataset is sent to the VLM endpoint at least once).
- Performance sample count: `48289` (i.e., the entire dataset will be loaded into
the host memory, which takes ~6.39 GB).
- Testing duration $\ge$ 10 mins.
- Sample concatenation permutation is enabled.
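
As a quick arithmetic check on the model quality target above: $0.7903037 \times 0.99 = 0.78240066$, which rounds to the `0.7824` threshold.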


## Developer Guide

### Linting

You can lint the Q3VL benchmark source code by running the following script:

```bash
bash multimodal/qwen3-vl/scripts/linters.sh
```