diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 96396dab92..4e72f542ed 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -312,6 +312,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::server_num_issue_query_threads) .def_readwrite("offline_expected_qps", &TestSettings::offline_expected_qps) + .def_readwrite("sample_concatenate_permutation", + &TestSettings::sample_concatenate_permutation) .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms) .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms) .def_readwrite("min_query_count", &TestSettings::min_query_count) @@ -324,6 +326,14 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::accuracy_log_rng_seed) .def_readwrite("accuracy_log_probability", &TestSettings::accuracy_log_probability) + .def_readwrite("accuracy_log_sampling_target", + &TestSettings::accuracy_log_sampling_target) + .def_readwrite("test05", &TestSettings::test05) + .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) + .def_readwrite("test05_sample_index_rng_seed", + &TestSettings::test05_sample_index_rng_seed) + .def_readwrite("test05_schedule_rng_seed", + &TestSettings::test05_schedule_rng_seed) .def_readwrite("print_timestamps", &TestSettings::print_timestamps) .def_readwrite("performance_issue_unique", &TestSettings::performance_issue_unique) @@ -333,12 +343,6 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::performance_issue_same_index) .def_readwrite("performance_sample_count_override", &TestSettings::performance_sample_count_override) - .def_readwrite("test05", &TestSettings::test05) - .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) - .def_readwrite("test05_sample_index_rng_seed", - &TestSettings::test05_sample_index_rng_seed) - .def_readwrite("test05_schedule_rng_seed", - &TestSettings::test05_schedule_rng_seed) .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1b825514bd..d21a73a47d 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -26,6 +26,7 @@ rgat.*.performance_sample_count_override = 788379 pointpainting.*.performance_sample_count_override = 1024 deepseek-r1.*.performance_sample_count_override = 4388 whisper.*.performance_sample_count_override = 1633 +qwen3-vl-235b-a22b.*.performance_sample_count_override = 48289 # set to 0 to let entire sample set to be performance sample 3d-unet.*.performance_sample_count_override = 0 @@ -67,7 +68,7 @@ llama3_1-8b-edge.*.sample_concatenate_permutation = 1 llama3_1-8b-interactive.*.sample_concatenate_permutation = 1 deepseek-r1.*.sample_concatenate_permutation = 1 whisper.*.sample_concatenate_permutation = 1 - +qwen3-vl-235b-a22b.*.sample_concatenate_permutation = 1 *.Server.target_latency = 10 *.Server.target_latency_percentile = 99 *.Server.target_duration = 0 @@ -91,7 +92,9 @@ llama3_1-8b-edge.*.use_token_latencies = 1 llama3_1-8b-interactive.*.use_token_latencies = 1 deepseek-r1.*.use_token_latencies = 1 whisper.*.use_token_latencies = 1 - +# For the VLM benchmark, the model response is relatively short, therefore we track +# end-to-end latency instead of token latencies. 
+qwen3-vl-235b-a22b.*.use_token_latencies = 0 # gptj benchmark infers token latencies gptj.*.infer_token_latencies = 1 gptj.*.token_latency_scaling_factor = 69 @@ -132,6 +135,8 @@ deepseek-r1.Server.target_latency = 0 deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 +qwen3-vl-235b-a22b.Server.target_latency = 12000 + *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 @@ -156,6 +161,7 @@ mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 deepseek-r1.Offline.min_query_count = 4388 whisper.Offline.min_query_count = 1633 +qwen3-vl-235b-a22b.Offline.min_query_count = 48289 # These fields should be defined and overridden by user.conf. *.SingleStream.target_latency = 10 diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 584d073bb8..2e092e721d 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -234,10 +234,6 @@ struct TestSettings { uint64_t test05_qsl_rng_seed = 0; uint64_t test05_sample_index_rng_seed = 0; uint64_t test05_schedule_rng_seed = 0; - - /// \brief Load mlperf parameter config from file. - int FromConfig(const std::string &path, const std::string &model, - const std::string &scenario, int conf_type = 1); /**@}*/ // ================================== @@ -272,6 +268,10 @@ struct TestSettings { bool infer_token_latencies = false; uint64_t token_latency_scaling_factor; /**@}*/ + + /// \brief Load mlperf parameter config from file. + int FromConfig(const std::string &path, const std::string &model, + const std::string &scenario, int conf_type = 1); }; /// diff --git a/multimodal/vl2l/.gitignore b/multimodal/qwen3-vl/.gitignore similarity index 100% rename from multimodal/vl2l/.gitignore rename to multimodal/qwen3-vl/.gitignore diff --git a/multimodal/qwen3-vl/README.md b/multimodal/qwen3-vl/README.md new file mode 100644 index 0000000000..37274b2f45 --- /dev/null +++ b/multimodal/qwen3-vl/README.md @@ -0,0 +1,280 @@ +# Reference Implementation for the Qwen3-VL (Q3VL) Benchmark + +## Quick Start + +This guide demonstrates how you can run the benchmark on your local machine. + +### Create a Conda environment + +Follow [this link](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions) +on how to install Miniconda on your host machine. Then, you can create a new conda +environment via: + +```bash +conda create -n mlperf-inf-mm-q3vl python=3.12 +``` + +### Install the Q3VL benchmarking CLI + +#### For users + +Install `mlperf-inf-mm-q3vl` with: + +```bash +pip install git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/ +``` + +#### For developers + +Clone the MLPerf Inference repo via: + +```bash +git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf-inference +``` + +Then enter the repo: + +```bash +cd mlperf-inference/ +``` + +Install `mlperf-inf-mm-q3vl` and the development tools with: + +- On Bash +```bash +pip install -e multimodal/qwen3-vl/[dev] +``` +- On Zsh +```zsh +pip install -e multimodal/qwen3-vl/"[dev]" +``` + +### Post Q3VL benchmarking CLI installation + +After installation, you can check the CLI flags that `mlperf-inf-mm-q3vl` can take with: + +```bash +mlperf-inf-mm-q3vl --help +``` + +You can enable shell autocompletion for `mlperf-inf-mm-q3vl` with: + +```bash +mlperf-inf-mm-q3vl --install-completion +``` + +> [!NOTE] +> Shell auto-completion will take effect once you restart the terminal. 
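+
+As a quick, optional sanity check (an illustrative sketch only, assuming `mlperf_loadgen`
+was pulled in as a dependency of the CLI install), you can confirm from Python that the
+LoadGen fields newly exposed in `loadgen/bindings/python_api.cc`, such as
+`sample_concatenate_permutation` and `accuracy_log_sampling_target`, are available:
+
+```python
+import mlperf_loadgen as lg
+
+settings = lg.TestSettings()
+settings.scenario = lg.TestScenario.Offline
+settings.mode = lg.TestMode.PerformanceOnly
+# Fields exposed to Python by this change:
+settings.sample_concatenate_permutation = True
+settings.accuracy_log_sampling_target = 0
+settings.test05 = False
+print("mlperf_loadgen TestSettings fields are accessible")
+```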
+ +### Start an inference endpoint on your local host machine with vLLM + +Please refer to [this guide on how to launch vLLM for various Qwen3 VL MoE models](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3-VL.html). + +```bash +docker run --gpus all \ # Use all the GPUs on this host machine. + -v ~/.cache/huggingface:/root/.cache/huggingface \ # Use the HuggingFace cache from your host machine. + -p 8000:8000 \ # This assumes the endpoint will use port 8000. + --ipc=host \ # The container can access and utilize the host's IPC mechanisms (e.g., shared memory). + vllm/vllm-openai:nightly \ # You can also use the `:latest` container or a specific release. + --model Qwen/Qwen3-VL-235B-A22B-Instruct \ # Specifies the model for vLLM to deploy. + --tensor-parallel-size 8 \ # 8-way tensor-parallel inference across 8 GPUs. + --limit-mm-per-prompt.video 0 # The input requests will contain images only (i.e., no videos). +``` + +### Run the benchmark for the Offline scenario + +Performance only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode performance_only +``` + +Accuracy only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode accuracy_only +``` + +### Run the benchmark for the Server scenario + +Performance only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode performance_only +``` + +Accuracy only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only +``` + +### Pass in `user.conf` + +You can pass in a `user.conf` file through `--settings.user_conf.path`, such that the +LoadGen parameters provided through the CLI will be overridden by the `user.conf` +provided by you and the `mlperf.conf` inside the LoadGen. An example `user.conf` file +is included: [example_user.conf](./example_user.conf). As such, you can run the +benchmark with `user.conf` via: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint \ + --settings.test.scenario <offline|server> \ + --settings.test.mode <performance_only|accuracy_only> \ + --settings.user_conf.path example_user.conf +``` + +### Evaluate the response quality + +You should pass the `mlperf_log_accuracy.json` file (generated by LoadGen) to the +`--filename` flag of the `mlperf-inf-mm-q3vl evaluate` command. + +```bash +mlperf-inf-mm-q3vl evaluate --filename output/mlperf_log_accuracy.json +``` + +## Docker + +[docker/](docker/) provides examples of Dockerfiles that install the Q3VL benchmarking +CLI into the container images of the inference engine. This is useful when you have to +run both the inference engine and the Q3VL benchmarking CLI inside the same container, +for example, in a situation where you must use a GPU cluster managed by +[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and +[pyxis](https://github.com/NVIDIA/pyxis). + +As an illustrative example, assuming that you are at the root directory of the MLPerf +Inference repo: + +1. You can build a container image against vLLM's +`vllm/vllm-openai:v0.12.0` release by + +```bash +docker build \ + --build-arg BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 \ + --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=multimodal/qwen3-vl \ + -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ + -t mlperf-inf-mm-q3vl:vllm-openai-v0.12.0 \ + .
+``` +> [!NOTE] +> `MLPERF_INF_MM_Q3VL_INSTALL_URL` can also take in a remote GitHub location, such as +> `git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/`. + +2. Afterwards, you can start the container in the interactive mode by + +```bash +docker run --rm -it --gpus all -v ~/.cache:/root/.cache --ipc=host mlperf-inf-mm-q3vl:vllm-openai-v0.12.0 +``` + +### Benchmark against vLLM inside the container + +If you are running `mlperf-inf-mm-q3vl` inside a local environment that has access to +vLLM (such as inside a container that was created using the +[docker/vllm-cuda.Dockerfile](docker/vllm-cuda.Dockerfile)), you can use a single +`mlperf-inf-mm-q3vl benchmark vllm` command to achieve: + +1. Deploy an endpoint using vLLM. +2. Wait for the endpoint to be healthy. +3. Run the benchmark against that endpoint. + +For example, inside the container, you can run the Offline scenario Accuracy only +mode with: + +```bash +mlperf-inf-mm-q3vl benchmark vllm \ + --settings.test.scenario offline \ + --settings.test.mode accuracy_only \ + --settings.user_conf.path example_user.conf \ + --vllm.cli=--async-scheduling \ + --vllm.cli=--max-model-len=32768 \ + --vllm.cli=--max-num-seqs=1024 \ + --vllm.cli=--compilation-config='{ + "cudagraph_capture_sizes": [ + 1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, + 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, + 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, + 496, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768 + ] + }' \ + --vllm.cli=--limit-mm-per-prompt.video=0 \ + --vllm.cli=--tensor-parallel-size=8 +``` + +## Slurm + +[scripts/slurm/](scripts/slurm/) provide example scripts of running both the benchmark +and the response quality evaluation in a GPU cluster managed by +[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and +[pyxis](https://github.com/NVIDIA/pyxis). Specifically, + +- [scripts/slurm/benchmark.sh](scripts/slurm/benchmark.sh) is a sbatch script that + runs the benchmarking job. +- [scripts/slurm/evaluate.sh](scripts/slurm/evaluate.sh) is a sbatch script that runs + the evaluation job. +- [scripts/slurm/submit.sh](scripts/slurm/submit.sh) is a Bash script that submits both + jobs, where the evaluation job would only run if the benchmarking job has succeeded. + +You can check the CLI flags that [scripts/slurm/submit.sh](scripts/slurm/submit.sh) can +take via: + +```bash +bash submit.sh --help +``` + +> [!NOTE] +> Slurm clusters are often highly customized per organization. If you are unfamiliar +> with Slurm, you should check with the cluster administrator of your organization +> first, get a good understanding of what those example scripts do, and adapt the +> example scripts to the specific settings for the Slurm cluster that you are going +> to use, before you try to launch any jobs. 
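+
+The job dependency that [scripts/slurm/submit.sh](scripts/slurm/submit.sh) sets up can be
+summarized by the sketch below. It is illustrative only (it assumes the standard
+`sbatch --parsable` and `--dependency=afterok:<job_id>` flags and the script names above);
+the actual submission logic lives in `submit.sh`:
+
+```python
+import subprocess
+
+# Submit the benchmarking job; --parsable makes sbatch print only the job ID.
+benchmark_job_id = subprocess.run(
+    ["sbatch", "--parsable", "benchmark.sh"],
+    check=True, capture_output=True, text=True,
+).stdout.strip()
+
+# Submit the evaluation job so that it starts only if the benchmark job succeeds.
+subprocess.run(
+    ["sbatch", f"--dependency=afterok:{benchmark_job_id}", "evaluate.sh"],
+    check=True,
+)
+```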
+ +## Reference Implementation Specification + +- v6.0 Round + - vLLM version: [v0.12.0](https://github.com/vllm-project/vllm/releases/tag/v0.12.0) + - Model: + - [Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct) + - Commit SHA: [710c13861be6c466e66de3f484069440b8f31389](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct/tree/710c13861be6c466e66de3f484069440b8f31389) + - Dataset: + - [Shopify/product-catalogue](https://huggingface.co/datasets/Shopify/product-catalogue) + - Commit SHA: [d5c517c509f5aca99053897ef1de797d6d7e5aa5](https://huggingface.co/datasets/Shopify/product-catalogue/tree/d5c517c509f5aca99053897ef1de797d6d7e5aa5) + - Both the `train` and the `test` splits are used and concatenated in that order. + - Total number of samples: `48289`. + - Guided decoding is not used. + - Sampling parameters: + - Frequency penalty: `0.0` + - Presence penalty: `0.0` + - Temperature: `1.0` + - Top-P: `1.0` + - Top-K: `0` + - Min-P: `0.0` + - Repetition penalty: `1.0` + - Constraints: + - Model quality: + - Category Hierarchical F1 Score >= `0.7824`. This is the 99% recovery of + `0.7903037` which is the mean category hierarchical F1 score across 10 runs on + [the BF16 version of the model](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct). + The standard deviation across those 10 runs is `0.0002250412555`. + - Server Scenario: + - Target latency is used as the constraint, instead of Time to First Token (TTFT) + or Time per Output Token (TPOT) latencies. + - Target latency percentile = `0.99`. + - Target latency $\le$ 12 seconds. + - Offline Scenario: + - Number of samples in the query $\ge$ `48289` (i.e., every sample in the entire + dataset would be send to the VLM endpoint at least once). + - Performance sample count: `48289` (i.e., the entire dataset will be loaded into + the host memory, which takes ~6.39 GB). + - Testing duration $\ge$ 10 mins. + - Sample concatenation permutation is enabled. + + +## Developer Guide + +### Linting + +You can lint the Q3VL benchmark source code by running the following script: + +```bash +bash multimodal/qwen3-vl/scripts/linters.sh +``` \ No newline at end of file diff --git a/multimodal/vl2l/docker/vllm-cuda.Dockerfile b/multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile similarity index 65% rename from multimodal/vl2l/docker/vllm-cuda.Dockerfile rename to multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile index 0c7597ce76..a54bda1364 100644 --- a/multimodal/vl2l/docker/vllm-cuda.Dockerfile +++ b/multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile @@ -9,33 +9,33 @@ # docker build -t myimage . # # 2. Install from a different git URL or branch: -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=git+https://github.com/USER/REPO.git@BRANCH#subdirectory=multimodal/vl2l \ +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=git+https://github.com/USER/REPO.git@BRANCH#subdirectory=multimodal/qwen3-vl \ # -t myimage . # # 3. Install from local directory (build from repo root with git auto-detection): # (Version number will be auto-detected from git if the build context includes .git) -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=multimodal/vl2l \ -# -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=multimodal/qwen3-vl \ +# -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ # -t myimage . # -# 4. Install from local directory (build from multimodal/vl2l subdirectory): +# 4. 
Install from local directory (build from multimodal/qwen3-vl subdirectory): # (No .git in subdirectory, will use fallback version "0.0.0.dev0") -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ -# -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ -# -t myimage multimodal/vl2l +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ +# -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ +# -t myimage multimodal/qwen3-vl # -# 5. Install from local directory when pwd is already multimodal/vl2l: +# 5. Install from local directory when pwd is already multimodal/qwen3-vl: # (No .git in subdirectory, will use fallback version "0.0.0.dev0") -# cd multimodal/vl2l -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ +# cd multimodal/qwen3-vl +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ # -f docker/vllm-cuda.Dockerfile \ # -t myimage . # # 6. Install from local directory with a custom fallback version: # (Override the default "0.0.0.dev0" version when git is not available) -# cd multimodal/vl2l -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ -# --build-arg MLPERF_INF_MM_VL2L_VERSION=1.0.0 \ +# cd multimodal/qwen3-vl +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ +# --build-arg MLPERF_INF_MM_Q3VL_VERSION=1.0.0 \ # -f docker/vllm-cuda.Dockerfile \ # -t myimage . # @@ -45,29 +45,29 @@ # # ============================================================================ -ARG BASE_IMAGE_URL=vllm/vllm-openai:nightly +ARG BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 FROM ${BASE_IMAGE_URL} -# MLPERF_INF_MM_VL2L_INSTALL_URL can be either: +# MLPERF_INF_MM_Q3VL_INSTALL_URL can be either: # 1. A git URL (default): git+https://github.com/... -# 2. A local directory path relative to the build context (e.g., multimodal/vl2l) +# 2. A local directory path relative to the build context (e.g., multimodal/qwen3-vl) # Note: The build context is the directory you pass to `docker build` (the final arg) -# MLPERF_INF_MM_VL2L_INSTALL_URL must be a valid path inside that build context -ARG MLPERF_INF_MM_VL2L_INSTALL_URL=git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l +# MLPERF_INF_MM_Q3VL_INSTALL_URL must be a valid path inside that build context +ARG MLPERF_INF_MM_Q3VL_INSTALL_URL=git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl # Temporary directory inside the container where the build context will be copied # Only used when installing from a local directory path -ARG BUILD_CONTEXT_DIR=/tmp/mm_vl2l_build_context +ARG BUILD_CONTEXT_DIR=/tmp/mm_q3vl_build_context # Fallback version to use when building from local directory without git metadata # setuptools-scm will first try to detect version from .git, and use this as fallback # Must be a valid PEP 440 version string (e.g., "0.0.0.dev0", "1.0.0", "0.1.0.dev1") # Can be overridden at build time with --build-arg -ARG MLPERF_INF_MM_VL2L_VERSION=0.0.0.dev0 +ARG MLPERF_INF_MM_Q3VL_VERSION=0.0.0.dev0 # Install # - git (required for installing "git+..." dependencies to work) -# - tmux (for `vllm serve` and `mlperf-inf-mm-vl2l` in different tmux sessions) +# - tmux (for `vllm serve` and `mlperf-inf-mm-q3vl` in different tmux sessions) # - vim (for editing files in the container) RUN apt-get update && \ apt-get install -y git tmux vim && \ @@ -79,25 +79,25 @@ RUN apt-get update && \ #ENV LD_LIBRARY_PATH=/usr/local/lib/python3.12/dist-packages/torch/lib:$LD_LIBRARY_PATH # Copy build context. 
-# This will be used only if MLPERF_INF_MM_VL2L_INSTALL_URL is a local path. +# This will be used only if MLPERF_INF_MM_Q3VL_INSTALL_URL is a local path. COPY . ${BUILD_CONTEXT_DIR}/ -# Install the mlperf-inference-multimodal-vl2l package. +# Install the mlperf-inference-multimodal-q3vl package. # We use --system to install into the container's global python environment. -# Detect if MLPERF_INF_MM_VL2L_INSTALL_URL is a git URL or a local path: -RUN if echo "${MLPERF_INF_MM_VL2L_INSTALL_URL}" | grep -q "^git+"; then \ - echo "Installing from git URL: ${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ - uv pip install --system --no-cache --verbose "${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ +# Detect if MLPERF_INF_MM_Q3VL_INSTALL_URL is a git URL or a local path: +RUN if echo "${MLPERF_INF_MM_Q3VL_INSTALL_URL}" | grep -q "^git+"; then \ + echo "Installing from git URL: ${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ + uv pip install --system --no-cache --verbose "${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ else \ - echo "Installing from local path: ${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ + echo "Installing from local path: ${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ # Check if the package directory is inside a git repository \ - if cd "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_VL2L_INSTALL_URL}" && git rev-parse --git-dir > /dev/null 2>&1; then \ + if cd "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_Q3VL_INSTALL_URL}" && git rev-parse --git-dir > /dev/null 2>&1; then \ echo "Git repository detected, setuptools-scm will detect version automatically"; \ else \ - echo "Not in a git repository, using fallback version: ${MLPERF_INF_MM_VL2L_VERSION}"; \ - export SETUPTOOLS_SCM_PRETEND_VERSION_FOR_MLPERF_INFERENCE_MULTIMODAL_VL2L="${MLPERF_INF_MM_VL2L_VERSION}"; \ + echo "Not in a git repository, using fallback version: ${MLPERF_INF_MM_Q3VL_VERSION}"; \ + export SETUPTOOLS_SCM_PRETEND_VERSION_FOR_MLPERF_INF_MM_Q3VL="${MLPERF_INF_MM_Q3VL_VERSION}"; \ fi; \ - uv pip install --system --no-cache --verbose "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ + uv pip install --system --no-cache --verbose "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ fi && \ rm -rf "${BUILD_CONTEXT_DIR}" diff --git a/multimodal/qwen3-vl/example_user.conf b/multimodal/qwen3-vl/example_user.conf new file mode 100644 index 0000000000..615c92fe67 --- /dev/null +++ b/multimodal/qwen3-vl/example_user.conf @@ -0,0 +1,7 @@ +*.Offline.target_qps = 80.4816666667 +*.Offline.min_duration = 600000 +*.Offline.min_query_count = 48289 + +*.Server.target_qps = 5.0 +*.Server.min_duration = 600000 +*.Server.min_query_count = 48289 \ No newline at end of file diff --git a/multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb b/multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb similarity index 99% rename from multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb rename to multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb index d973c74014..682398ac37 100644 --- a/multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb +++ b/multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "f194bfdf-c9f1-4738-bdb5-258dd4bc05f0", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ "from io import BytesIO\n", "import base64\n", "import pprint\n", - "from mlperf_inference_multimodal_vl2l.task import Task\n", + "from mlperf_inf_mm_q3vl.task import Task\n", "from openai import AsyncOpenAI, DefaultAioHttpClient\n", "import numpy as np\n", "import json\n", @@ 
-451,12 +451,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "b6aa7372", "metadata": {}, "outputs": [], "source": [ - "from mlperf_inference_multimodal_vl2l.task import ProductMetadata\n", + "from mlperf_inf_mm_q3vl.task import ProductMetadata\n", "\n", "def build_messages(sample):\n", " image_file = BytesIO()\n", diff --git a/multimodal/vl2l/pyproject.toml b/multimodal/qwen3-vl/pyproject.toml similarity index 82% rename from multimodal/vl2l/pyproject.toml rename to multimodal/qwen3-vl/pyproject.toml index 1d1d90ec75..255b3f4e16 100644 --- a/multimodal/vl2l/pyproject.toml +++ b/multimodal/qwen3-vl/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "mlperf-inference-multimodal-vl2l" -description = "The reference implementation for the vision-language-to-language (VL2L) benchmark in MLPerf Inference" +name = "mlperf-inf-mm-q3vl" +description = "The reference implementation for the Qwen3-VL (Q3VL) benchmark in MLPerf Inference" readme = "README.md" classifiers = [ "Programming Language :: Python :: 3", @@ -30,10 +30,10 @@ dynamic = ["version"] dev = ["black", "ruff", "mypy", "shellcheck-py", "pytest"] [project.scripts] -mlperf-inf-mm-vl2l = "mlperf_inference_multimodal_vl2l.cli:app" +mlperf-inf-mm-q3vl = "mlperf_inf_mm_q3vl.cli:app" [project.urls] -Homepage = "https://github.com/mlcommons/inference/multimodal/vl2l" +Homepage = "https://github.com/mlcommons/inference/multimodal/qwen3-vl" [build-system] requires = ["setuptools>=80", "setuptools-scm[simple]>=8"] @@ -43,7 +43,7 @@ build-backend = "setuptools.build_meta" where = ["src"] [tool.setuptools.package-data] -"mlperf_inference_multimodal_vl2l" = ["py.typed"] +"mlperf_inf_mm_q3vl" = ["py.typed"] [tool.setuptools_scm] root = "../../" diff --git a/multimodal/vl2l/scripts/linters.sh b/multimodal/qwen3-vl/scripts/linters.sh similarity index 100% rename from multimodal/vl2l/scripts/linters.sh rename to multimodal/qwen3-vl/scripts/linters.sh diff --git a/multimodal/qwen3-vl/scripts/slurm/benchmark.sh b/multimodal/qwen3-vl/scripts/slurm/benchmark.sh new file mode 100644 index 0000000000..00167cd3b3 --- /dev/null +++ b/multimodal/qwen3-vl/scripts/slurm/benchmark.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --time=4:00:00 +#SBATCH --partition=batch +#SBATCH --tasks=1 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --exclusive +#SBATCH --output=benchmark-slurm-output-%j.txt +#SBATCH --error=benchmark-slurm-error-%j.txt + +set -eux +set -o pipefail + +mkdir -p "${OUTPUT_HOST_DIR}"/"${SLURM_JOB_ID}" + +srun \ + --container-image="${CONTAINER_IMAGE}" \ + --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ + --no-container-mount-home \ + mlperf-inf-mm-q3vl benchmark vllm \ + --settings.test.scenario="${SCENARIO}" \ + --settings.test.mode="${MODE}" \ + --settings.test.server_expected_qps="${SERVER_EXPECTED_QPS}" \ + --vllm.model.repo_id="${MODEL_REPO_ID}" \ + --vllm.cli=--async-scheduling \ + --vllm.cli=--max-model-len=32768 \ + --vllm.cli=--limit-mm-per-prompt.video=0 \ + --vllm.cli=--tensor-parallel-size="${TENSOR_PARALLEL_SIZE}" \ + --settings.logging.log_output.outdir="${OUTPUT_CONTAINER_DIR}"/"${SLURM_JOB_ID}" \ No newline at end of file diff --git a/multimodal/qwen3-vl/scripts/slurm/evaluate.sh b/multimodal/qwen3-vl/scripts/slurm/evaluate.sh new file mode 100644 index 0000000000..54615f2e33 --- /dev/null +++ b/multimodal/qwen3-vl/scripts/slurm/evaluate.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH --time=1:00:00 +#SBATCH --partition=cpu_short 
+#SBATCH --nodes=1 +#SBATCH --tasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem-per-cpu=16G +#SBATCH --output=evaluate-slurm-output-%j.txt +#SBATCH --error=evaluate-slurm-error-%j.txt + +set -eux +set -o pipefail + +srun \ + --container-image="${CONTAINER_IMAGE}" \ + --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ + --no-container-mount-home \ + --container-env=NVIDIA_VISIBLE_DEVICES \ + mlperf-inf-mm-q3vl evaluate \ + --filename="${OUTPUT_CONTAINER_DIR}"/"${BENCHMARK_JOB_ID}"/mlperf_log_accuracy.json \ No newline at end of file diff --git a/multimodal/qwen3-vl/scripts/slurm/submit.sh b/multimodal/qwen3-vl/scripts/slurm/submit.sh new file mode 100644 index 0000000000..8e07336d7f --- /dev/null +++ b/multimodal/qwen3-vl/scripts/slurm/submit.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +set -eux +set -o pipefail + +DEFAULT_CONTAINER_IMAGE="" +container_image=${DEFAULT_CONTAINER_IMAGE} + +DEFAULT_MODEL_REPO_ID=Qwen/Qwen3-VL-235B-A22B-Instruct +model_repo_id=${DEFAULT_MODEL_REPO_ID} + +DEFAULT_SCENARIO=offline +scenario=${DEFAULT_SCENARIO} + +DEFAULT_MODE=accuracy_only +mode=${DEFAULT_MODE} + +DEFAULT_SERVER_EXPECTED_QPS=5 +server_expected_qps=${DEFAULT_SERVER_EXPECTED_QPS} + +DEFAULT_TENSOR_PARALLEL_SIZE=8 +tensor_parallel_size=${DEFAULT_TENSOR_PARALLEL_SIZE} + +DEFAULT_CACHE_HOST_DIR="" +cache_host_dir=${DEFAULT_CACHE_HOST_DIR} + +DEFAULT_OUTPUT_HOST_DIR=$(pwd)/outputs +output_host_dir=${DEFAULT_OUTPUT_HOST_DIR} + +DEFAULT_SLURM_ACCOUNT="" +slurm_account=${DEFAULT_SLURM_ACCOUNT} + +DEFAULT_BENCHMARK_SLURM_PARTITION="" +benchmark_slurm_partition=${DEFAULT_BENCHMARK_SLURM_PARTITION} + +DEFAULT_EVALUATE_SLURM_PARTITION="" +evaluate_slurm_partition=${DEFAULT_EVALUATE_SLURM_PARTITION} + +function _exit_with_help_msg() { + cat < None: - """Run the VL2L benchmark.""" - logger.info("Running VL2L benchmark with settings: {}", settings) - logger.info("Running VL2L benchmark with dataset: {}", dataset) + """Run the Qwen3-VL (Q3VL) benchmark.""" logger.info( - "Running VL2L benchmark with OpenAI API endpoint: {}", - endpoint) - logger.info("Running VL2L benchmark with random seed: {}", random_seed) + "Running Qwen3-VL (Q3VL) benchmark with settings: {}", + settings) + logger.info("Running Qwen3-VL (Q3VL) benchmark with dataset: {}", dataset) + logger.info( + "Running Qwen3-VL (Q3VL) benchmark with OpenAI API endpoint: {}", + endpoint, + ) + logger.info( + "Running Qwen3-VL (Q3VL) benchmark with random seed: {}", + random_seed) test_settings, log_settings = settings.to_lgtype() task = ShopifyGlobalCatalogue( dataset=dataset, @@ -96,9 +101,9 @@ ) sut = task.construct_sut() qsl = task.construct_qsl() - logger.info("Starting the VL2L benchmark with LoadGen...") + logger.info("Starting the Qwen3-VL (Q3VL) benchmark with LoadGen...") lg.StartTestWithLogSettings(sut, qsl, test_settings, log_settings) - logger.info("The VL2L benchmark with LoadGen completed.") + logger.info("The Qwen3-VL (Q3VL) benchmark with LoadGen completed.") lg.DestroyQSL(qsl) lg.DestroySUT(sut) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py similarity index 80% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py index 8db6acfa8a..66303ba6fd 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py +++
b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py @@ -16,6 +16,7 @@ from .log import get_log_file_path if TYPE_CHECKING: + from pathlib import Path from types import TracebackType from .schema import EndpointToDeploy, Settings, VllmEndpoint @@ -100,11 +101,17 @@ def _startup(self) -> None: """ raise NotImplementedError + @abstractmethod + def _failfast(self) -> None: + """Raise an exception if the endpoint is already detected to be dead.""" + raise NotImplementedError + def _wait_for_ready(self) -> None: """Wait for the endpoint to be ready.""" health_url = self.endpoint.url.rstrip("/v1") + "/health" start_time = time.time() while time.time() - start_time < self.endpoint.startup_timeout.total_seconds(): + self._failfast() logger.info( "Waiting {:0.2f} seconds for endpoint to be ready...", time.time() - start_time, @@ -134,6 +141,31 @@ def _shutdown(self) -> None: raise NotImplementedError +class LocalProcessNotStartedError(RuntimeError): + """The exception raised when the local process is not started yet.""" + + def __init__(self) -> None: + """Initialize the exception.""" + super().__init__("Local process is not started yet.") + + +class LocalProcessDeadError(RuntimeError): + """The exception raised when the local process is already detected to be dead.""" + + def __init__( + self, + returncode: int, + stdout_file_path: Path, + stderr_file_path: Path, + ) -> None: + """Initialize the exception.""" + super().__init__( + f"Local process has already terminated with return code {returncode}. " + f"Please check the logs in {stdout_file_path} and " + f"{stderr_file_path} for more details.", + ) + + class LocalProcessDeployer(EndpointDeployer): """Deploy and manage an endpoint that is powered by a local process.""" @@ -146,6 +178,14 @@ def __init__(self, endpoint: EndpointToDeploy, settings: Settings) -> None: """ super().__init__(endpoint=endpoint, settings=settings) self._process: subprocess.Popen | None = None + self._stdout_file_path = get_log_file_path( + key=self._stdout_log_file_key, + settings=self.settings, + ) + self._stderr_file_path = get_log_file_path( + key=self._stderr_log_file_key, + settings=self.settings, + ) @abstractmethod def _build_command(self) -> list[str]: @@ -172,34 +212,38 @@ def _startup(self) -> None: "Starting local process with environment variables: {}", os.environ) - # Get log file paths - stdout_file_path = get_log_file_path( - key=self._stdout_log_file_key, - settings=self.settings, - ) - stderr_file_path = get_log_file_path( - key=self._stderr_log_file_key, - settings=self.settings, - ) - # Start the server process = subprocess.Popen( # noqa: S603 cmd, - stdout=stdout_file_path.open("w"), - stderr=stderr_file_path.open("w"), + stdout=self._stdout_file_path.open("w"), + stderr=self._stderr_file_path.open("w"), text=True, ) logger.info("Started local process with PID: {}", process.pid) logger.info( "Local process stdout will be logged to: {}", - stdout_file_path) + self._stdout_file_path, + ) logger.info( "Local process stderr will be logged to: {}", - stderr_file_path) + self._stderr_file_path, + ) self._process = process + def _failfast(self) -> None: + """Raise an exception if the local process is already detected to be dead.""" + if self._process is None: + raise LocalProcessNotStartedError + returncode = self._process.poll() + if returncode is not None: + raise LocalProcessDeadError( + returncode=returncode, + stdout_file_path=self._stdout_file_path, + stderr_file_path=self._stderr_file_path, + ) + def _shutdown(self) -> None: """Shut down the local 
process gracefully.""" if self._process is None: @@ -256,12 +300,17 @@ def _build_command(self) -> list[str]: "vllm", "serve", self.endpoint.model.repo_id, + "--revision", + self.endpoint.model.revision, "--host", host, "--port", str(port), ] + if self.endpoint.model.token: + cmd.extend(["--hf-token", self.endpoint.model.token]) + # Add API key if provided if self.endpoint.api_key: cmd.extend(["--api-key", self.endpoint.api_key]) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py similarity index 63% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py index 2076bdbab8..7bf59ce302 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py @@ -1,14 +1,15 @@ -"""Task definitions for the VL2L benchmark.""" +"""Task definitions for the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations import json +import os +from concurrent.futures import ProcessPoolExecutor from pathlib import Path from typing import TYPE_CHECKING import numpy as np from datasets import load_dataset -from hiclass.metrics import f1 # type: ignore[import-untyped] from loguru import logger from pydantic import ValidationError from rapidfuzz import fuzz # type: ignore[import-untyped] @@ -16,17 +17,21 @@ from tabulate import tabulate if TYPE_CHECKING: + from typing import Any + from pydantic import FilePath from .cli import Dataset as DatasetCLI from .schema import ProductMetadata -_TRUE_CATEGORY_PAD = "<|__TRUE_CATEGORY_PAD__|>" _PRED_CATEGORY_PAD = "<|__PRED_CATEGORY_PAD__|>" _PRED_BRAND_PAD = "<|__PRED_BRAND_PAD__|>" _CATEGORY_SEPARATOR = " > " +_WORKER_CONTEXT = {} +_MAX_JOBS = 4 + def get_hierarchical_components( predicted_path: str, @@ -159,63 +164,42 @@ def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: return f1_score(y_src, y_pred) -def calculate_hiclass_f1( - data: list[tuple[str, str]], - separator: str = _CATEGORY_SEPARATOR, -) -> float: - """Alt method to calculate hierarchical F1. +def _process_chunk_rnd_brand(args: tuple[str, dict, dict]) -> tuple[str, str]: + """Function to process only chunks for random brand predictions. Args: - data: List of tuples of predicted and true values - separator: The separator used to split the paths into levels of the category. - - Returs: - f1 score + args: Tuple containing """ - y_pred_raw = [] - y_true_raw = [] + pred_brand, elem, data_source = args + # We pass the specific data row needed, or the whole structure if efficient + return (pred_brand, data_source[elem["qsl_idx"]]["ground_truth_brand"]) - for pred, src in data: - path1 = pred.split(separator) - path2 = src.split(separator) - y_pred_raw.append(path1) - y_true_raw.append(path2) +def init_worker(dataset: dict) -> None: + """Initialize worker data to process each chunk. - # 2. Find the global maximum length across ALL samples - # We check the longest path in both true and pred lists - max_len = max(len(p) for p in y_true_raw + y_pred_raw) + Args: + dataset: huggingface dataset + """ + _WORKER_CONTEXT["dataset"] = dataset - # 3. 
Pad all lists to the global max_len - for i in range(len(y_true_raw)): - # Pad Truth - pad_len_true = max_len - len(y_true_raw[i]) - y_true_raw[i] += [_TRUE_CATEGORY_PAD] * pad_len_true - # Pad Prediction - pad_len_pred = max_len - len(y_pred_raw[i]) - y_pred_raw[i] += [_PRED_CATEGORY_PAD] * pad_len_pred +def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: + """Retrieve relevant information from each chunk of data. - # 4. Convert to numpy arrays - y_true = np.array(y_true_raw) - y_pred = np.array(y_pred_raw) + Args: + args: Tuple that contains chunk of data and seed - # 5. Calculate Score - return f1(y_true, y_pred) + Returns: + Object with processed information + """ + chunk_data, seed = args + # 1. Access the global dataset + dataset = _WORKER_CONTEXT["dataset"] -def run_evaluation(random_seed: int, filename: FilePath, - dataset: DatasetCLI) -> None: - """Main function to run the evaluation.""" - rng = np.random.default_rng(seed=random_seed) - with Path.open(filename) as f: - model_output = json.load(f) - - original_data = load_dataset( - dataset.repo_id, - token=dataset.token, - split="+".join(dataset.split), - ) + # 2. Create a local, reproducible RNG for this specific chunk + local_rng = np.random.default_rng(seed) num_unparsable_responses = 0 category_dataset_pred_src = [] @@ -223,13 +207,13 @@ def run_evaluation(random_seed: int, filename: FilePath, is_secondhand_pred_src = [] is_secondhand_rand_pred_src = [] brand_pred_src = [] - all_possible_brands = set() + error_messages = [] - for elem in model_output: + for elem in chunk_data: idx = elem["qsl_idx"] response = bytes.fromhex(elem["data"]).decode("utf-8") - ground_truth_item = original_data[idx] + ground_truth_item = dataset[idx] all_possible_brands.add(ground_truth_item["ground_truth_brand"]) try: pred_item = ProductMetadata.model_validate_json(response) @@ -245,14 +229,15 @@ def run_evaluation(random_seed: int, filename: FilePath, ), ), brand=_PRED_BRAND_PAD, - is_secondhand=rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice( + [True, False], size=1).tolist()[0], ) - logger.error( - "Response\n{}\n(for the sample at index {}) cannot be validated against" - " the expected schema. Overwriting this response into \n{}\n", - response, - idx, - pred_item, + error_messages.append( + ( + f"Response\n{response}\n(for the sample at index {idx})" + f"cannot be validated against the expected schema. 
" + f"Overwriting this response into \n{pred_item}\n", + ), ) category_dataset_pred_src.append( (pred_item.category, ground_truth_item["ground_truth_category"]), @@ -268,35 +253,122 @@ def run_evaluation(random_seed: int, filename: FilePath, ) # random category selection # Uniform distribution is the default - rand_cat = rng.choice( + rand_cat = local_rng.choice( ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection - rand_is_secondhand = rng.choice([True, False]) + rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), ) + return { + "num_unparsable_responses": num_unparsable_responses, + "error_messages": error_messages, + "category_dataset_pred_src": category_dataset_pred_src, + "category_rand_pred_src": category_rand_pred_src, + "is_secondhand_pred_src": is_secondhand_pred_src, + "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, + "brand_pred_src": brand_pred_src, + "all_possible_brands": list(all_possible_brands), + } + + +def run_evaluation(random_seed: int, filename: FilePath, + dataset: DatasetCLI) -> None: + """Main function to run the evaluation.""" + master_rng = np.random.default_rng(seed=random_seed) + with Path.open(filename) as f: + model_output = json.load(f) + + original_data = load_dataset( + dataset.repo_id, + token=dataset.token, + split="+".join(dataset.split), + ) + + # get number of available CPU and get chunk size + cpu_count = min(os.cpu_count() or 1, _MAX_JOBS) + chunk_size = max(len(model_output) // cpu_count, 1) + # Create chunks + output_chunks = [ + model_output[i: i + chunk_size] + for i in range(0, len(model_output), chunk_size) + ] + + # Generate Seeds + # One seed per chunk to ensure reproducibility. + # The master_rng generates these, + # so the whole run is deterministic based on `random_seed`. 
+ chunk_seeds = master_rng.integers(0, 2**32, size=len(output_chunks)) + + # Zip them: Each task is ([model_out_1, ...], 12345) + tasks = zip(output_chunks, chunk_seeds, strict=False) + + num_unparsable_responses = 0 + err_messages = [] + category_dataset_pred_src = [] + category_rand_pred_src = [] + is_secondhand_pred_src = [] + is_secondhand_rand_pred_src = [] + brand_pred_src = [] + all_possible_brands = [] + + with ProcessPoolExecutor( + max_workers=cpu_count, + initializer=init_worker, + initargs=(original_data,), + ) as executor: + # Execute + chunk_results = list(executor.map(_process_chunk, tasks)) + + for chunk in chunk_results: + num_unparsable_responses += chunk["num_unparsable_responses"] + err_messages.extend(chunk["error_messages"]) + category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) + category_rand_pred_src.extend(chunk["category_rand_pred_src"]) + is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) + is_secondhand_rand_pred_src.extend( + chunk["is_secondhand_rand_pred_src"]) + brand_pred_src.extend(chunk["brand_pred_src"]) + all_possible_brands.extend(chunk["all_possible_brands"]) + + for err in err_messages: + logger.error("{}", err) + category_f1_score = calculate_hierarchical_f1(category_dataset_pred_src) - hiclass_f1_score = calculate_hiclass_f1(category_dataset_pred_src) is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src) brand_score = calculate_brand_f1_score(brand_pred_src) rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_hiclass_f1_score = calculate_hiclass_f1(category_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1( is_secondhand_rand_pred_src) + + all_brands_list = list(set(all_possible_brands)) + random_brand_predictions = master_rng.choice( + all_brands_list, + size=len(model_output), + ) + + args_list = ( + (pred, elem, original_data) + for pred, elem in zip(random_brand_predictions, model_output, strict=False) + ) + + with ProcessPoolExecutor() as executor: + rand_brand_data = list( + executor.map( + _process_chunk_rnd_brand, + args_list, + chunksize=chunk_size), + ) + rand_brand_score = calculate_brand_f1_score( - [ - ( - rng.choice(list(all_possible_brands)), - original_data[elem["qsl_idx"]]["ground_truth_brand"], - ) - for elem in model_output - ], + rand_brand_data, ) logger.info( @@ -307,14 +379,12 @@ def run_evaluation(random_seed: int, filename: FilePath, [ "From accuracy file", category_f1_score, - hiclass_f1_score, brand_score, is_secondhand_f1_score, ], [ "Random selection", rand_cat_f1_score, - rand_hiclass_f1_score, rand_brand_score, rand_is_seconhand_f1_score, ], @@ -322,7 +392,6 @@ def run_evaluation(random_seed: int, filename: FilePath, headers=[ "Results", "Category hierarchical F1 Score", - "Category HiClass F1 Score", "Brand F1 Score", "Is_secondhand F1 Score", ], diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py similarity index 95% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py index a24eb514f0..56700ef7d8 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py @@ -1,4 +1,4 @@ -"""Logging utilities for the VL2L benchmark.""" +"""Logging utilities for the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/py.typed 
b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/py.typed similarity index 100% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/py.typed rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/py.typed diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py new file mode 100644 index 0000000000..9462aeedd3 --- /dev/null +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py @@ -0,0 +1,867 @@ +"""Schema definitions of various data structures in the Qwen3-VL (Q3VL) benchmark.""" + +from __future__ import annotations + +from datetime import timedelta +from enum import StrEnum, auto +from pathlib import Path +from typing import Annotated, ClassVar, Self + +import mlperf_loadgen as lg +from loguru import logger +from openai.types import ResponseFormatJSONSchema +from openai.types.chat import ChatCompletionMessageParam +from pydantic import ( + BaseModel, + ConfigDict, + DirectoryPath, + Field, + FilePath, + NonNegativeInt, + field_validator, + model_validator, +) + +MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES = 100 +ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES = 1 * 1024 * 1024 * 1024 # 1GB + + +class TestScenario(StrEnum): + """The test scenario for the MLPerf inference LoadGen.""" + + SERVER = auto() + """Run the benchmark in server/interactive scenario.""" + + OFFLINE = auto() + """Run the benchmark in offline/batch scenario.""" + + def to_lgtype(self) -> lg.TestScenario: + """Convert the test scenario to its corresponding LoadGen type.""" + match self: + case TestScenario.SERVER: + return lg.TestScenario.Server + case TestScenario.OFFLINE: + return lg.TestScenario.Offline + case _: + raise UnknownTestScenarioValueError(self) + + @staticmethod + def from_lgtype(lgtype: lg.TestScenario) -> TestScenario: + """Convert the LoadGen's test scenario to the TestScenario schema.""" + match lgtype: + case lg.TestScenario.Server: + return TestScenario.SERVER + case lg.TestScenario.Offline: + return TestScenario.OFFLINE + case _: + raise UnknownTestScenarioValueError(lgtype) + + +class UnknownTestScenarioValueError(ValueError): + """The exception raised when an unknown test scenario is encountered.""" + + def __init__(self, test_scenario: TestScenario | lg.TestScenario) -> None: + """Initialize the exception.""" + super().__init__(f"Unknown test scenario: {test_scenario}") + + +class TestMode(StrEnum): + """The test mode for the MLPerf inference LoadGen.""" + + PERFORMANCE_ONLY = auto() + """Run the benchmark to evaluate performance.""" + + ACCURACY_ONLY = auto() + """Run the benchmark to evaluate model quality.""" + + def to_lgtype(self) -> lg.TestMode: + """Convert the test mode to its corresponding LoadGen type.""" + match self: + case TestMode.PERFORMANCE_ONLY: + return lg.TestMode.PerformanceOnly + case TestMode.ACCURACY_ONLY: + return lg.TestMode.AccuracyOnly + case _: + raise UnknownTestModeValueError(self) + + @staticmethod + def from_lgtype(lgtype: lg.TestMode) -> TestMode: + """Convert the LoadGen's test mode to the TestMode schema.""" + match lgtype: + case lg.TestMode.PerformanceOnly: + return TestMode.PERFORMANCE_ONLY + case lg.TestMode.AccuracyOnly: + return TestMode.ACCURACY_ONLY + case _: + raise UnknownTestModeValueError(lgtype) + + +class UnknownTestModeValueError(ValueError): + """The exception raised when an unknown test mode is encountered.""" + + def __init__(self, test_mode: TestMode | lg.TestMode) -> None: + """Initialize the exception.""" + super().__init__(f"Unknown test mode: {test_mode}") + + +class 
LoggingMode(StrEnum): + """Specifies when logging should be sampled and stringified.""" + + ASYNC_POLL = auto() + """ Logs are serialized and output on an IOThread that polls for new logs + at a fixed interval. This is the only mode currently implemented.""" + + END_OF_TEST_ONLY = auto() + """ Not implemented """ + + SYNCHRONOUS = auto() + """ Not implemented """ + + def to_lgtype(self) -> lg.LoggingMode: + """Convert logging mode to its corresponding LoadGen type.""" + match self: + case LoggingMode.ASYNC_POLL: + return lg.LoggingMode.AsyncPoll + case _: + raise UnknownLoggingModeValueError(self) + + +class UnknownLoggingModeValueError(ValueError): + """The exception raised when an unknown logging mode is encountered.""" + + def __init__(self, logging_mode: LoggingMode) -> None: + """Initialize the exception.""" + super().__init__(f"Unknown logging mode: {logging_mode}") + + +class BaseModelWithAttributeDescriptionsFromDocstrings(BaseModel): + """Base model that automatically adds attribute descriptions from docstrings.""" + + model_config = ConfigDict(use_attribute_docstrings=True, extra="forbid") + """Pydantic settings for + - Automatically add the attribute descriptions from docstrings. + - Forbid extra attributes. + """ + + +_DEFAULT_DATASET_SIZE = 48289 +_DEFAULT_MIN_DURATION = timedelta(minutes=10) +_DEFAULT_OFFLINE_EXPECTED_QPS = ( + _DEFAULT_DATASET_SIZE / _DEFAULT_MIN_DURATION.total_seconds() +) + + +class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): + """The test settings for the MLPerf inference LoadGen.""" + + scenario: TestScenario = TestScenario.OFFLINE + """The MLPerf inference benchmarking scenario to run the benchmark in.""" + + mode: TestMode = TestMode.PERFORMANCE_ONLY + """Whether you want to run the benchmark for performance measurement or accuracy + evaluation. + """ + + """Server-specific settings""" + + server_target_qps: float = 5 + """The average QPS of the poisson distribution. Note: This field is used as a + FindPeakPerformance's lower bound. When you run FindPeakPerformanceMode, you should + make sure that this value satisfies performance constraints. + """ + + server_target_latency: timedelta = timedelta(seconds=12) + """The latency constraint for the Server scenario.""" + + server_target_latency_percentile: float = 0.99 + """The latency percentile for server mode. This value is combined with + server_target_latency to determine if a run is valid. + """ + + server_coalesce_queries: bool = False + """If this flag is set to True, LoadGen will combine samples from + multiple queries into a single query if their scheduled issue times have + passed. + """ + + server_find_peak_qps_decimals_of_precision: int = 1 + """The decimal places of QPS precision used to terminate + FindPeakPerformance mode. + """ + + server_find_peak_qps_boundary_step_size: float = 1 + """The step size (as a fraction of the QPS) used to widen the lower and + upper bounds to find the initial boundaries of binary search. + """ + + server_max_async_queries: int = 0 + """The maximum number of outstanding queries to allow before earlying out from a + performance run. Useful for performance tuning and speeding up the + FindPeakPerformance mode. + """ + + server_num_issue_query_threads: int = 0 + """The number of issue query threads that will be registered and used + to call SUT's IssueQuery(). If this is 0, the same thread calling + StartTest() will be used to call IssueQuery(). See also + mlperf::RegisterIssueQueryThread(). 
+ """ + + """Offline-specific settings""" + + offline_expected_qps: float = _DEFAULT_OFFLINE_EXPECTED_QPS + """Specifies the QPS the SUT expects to hit for the offline load. + The LoadGen generates 10% more queries than it thinks it needs to meet + the minimum test duration. + """ + + sample_concatenate_permutation: bool = True + """Affects the order in which the samples of the dataset are chosen. + If False, it concatenates a single permutation of the dataset (or part + of it depending on performance_sample_count_override) several times up to the + number of samples requested. + If True, it concatenates a multiple permutation of the dataset (or a + part of it depending on `performance_sample_count_override`) several times + up to the number of samples requested. + """ + + """Test duration settings""" + + min_duration: timedelta = _DEFAULT_MIN_DURATION + """The minimum testing duration (in seconds or ISO 8601 format like `PT5S`). The + benchmark runs until this value has been met. + """ + + max_duration: timedelta = timedelta(seconds=0) + """The maximum testing duration (in seconds or ISO 8601 format like `PT5S`). The + benchmark will exit before this value has been met. 0 means infinity. + """ + + min_query_count: int = _DEFAULT_DATASET_SIZE + """The minimum testing query count. The benchmark runs until this value has been + met. If min_query_count is less than the total number of samples in the dataset, + only the first min_query_count samples will be used during testing. + """ + + max_query_count: int = 0 + """The maximum testing query count. The benchmark will exit before this value has + been met. 0 means infinity. + """ + + """Random number generation settings""" + + qsl_rng_seed: int = 0 + """Affects which subset of samples from the QSL are chosen for + the performance sample set and accuracy sample sets.""" + + sample_index_rng_seed: int = 0 + """Affects the order in which samples from the performance set will + be included in queries.""" + + schedule_rng_seed: int = 0 + """Affects the poisson arrival process of the Server scenario. + Different seeds will appear to "jitter" the queries + differently in time, but should not affect the average issued QPS. + """ + + accuracy_log_rng_seed: int = 0 + """Affects which samples have their query returns logged to the + accuracy log in performance mode.""" + + accuracy_log_probability: float = 0.0 + """The probability of the query response of a sample being logged to the + accuracy log in performance mode.""" + + accuracy_log_sampling_target: int = 0 + """The target number of samples that will have their results printed to + accuracy log in performance mode for compliance testing.""" + + """Test05 settings""" + + test05: bool = False + """Whether or not to run test05.""" + + test05_qsl_rng_seed: int = 0 + """Test05 seed for which subset of samples from the QSL are chosen for + the performance sample set and accuracy sample sets.""" + + test05_sample_index_rng_seed: int = 0 + """Test05 seed for the order in which samples from the performance set will + be included in queries.""" + + test05_schedule_rng_seed: int = 0 + """Test05 seed for the poisson arrival process of the Server scenario. + Different seeds will appear to "jitter" the queries + differently in time, but should not affect the average issued QPS. 
+ """ + + """Performance Sample modifiers""" + + print_timestamps: bool = False + """Prints measurement interval start and stop timestamps to stdout + for the purpose of comparison against an external timer.""" + + performance_issue_unique: bool = False + """Allows issuing only unique queries in Performance mode of any + scenario. This can be used to send non-repeat & hence unique + samples to SUT. + """ + + performance_issue_same: bool = False + """If True, the same query is chosen repeatedley for Inference. + In offline scenario, the query is filled with the same sample. + """ + + performance_issue_same_index: int = 0 + """Offset to control which sample is repeated in + performance_issue_same mode. Value should be within [0, performance_sample_count). + """ + + performance_sample_count_override: Annotated[ + NonNegativeInt, + Field( + description="The number of samples to use for the performance test. In the " # noqa: S608 + "performance mode, the benchmark will select P random samples from the " + "dataset, then send enough queries using these P samples (and repeating " + "them if necessary) to reach the min_duration and min_query_count. If a " + "non-zero value is passed to this flag, the P will be this value. " + "Otherwise, the benchmark will estimate how many samples can be loaded into" + f" {ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES} bytes of memory " + "based on the memory footprint of randomly selected " + f"{MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES} samples (at most), and then" + " use this estimation as the value P.", + ), + ] = _DEFAULT_DATASET_SIZE + + use_token_latencies: bool = False + """By default, the Server scenario will use `server_target_latency` as the + constraint. When set to True, the Server scenario will use `server_ttft_latency` and + `server_tpot_latency` as the constraint. + """ + + server_ttft_latency: timedelta = timedelta(milliseconds=100) + """Time to First Token (TTFT) latency constraint result validation (used when + use_token_latencies is enabled). + """ + + server_tpot_latency: timedelta = timedelta(milliseconds=100) + """Time per Output Token (TPOT) latency constraint result validation (used when + use_token_latencies is enabled). + """ + + infer_token_latencies: bool = False + """Infer token latencies from the response time.""" + + token_latency_scaling_factor: int = 1 + """Only used when infer_token_latencies is enabled. The scaling factor inferring + token latencies from the response time. 
+ """ + + @field_validator( + "server_target_latency", + "min_duration", + "max_duration", + "server_ttft_latency", + "server_tpot_latency", + mode="before", + ) + @classmethod + def parse_timedelta(cls, value: timedelta | float | + str) -> timedelta | str: + """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" + if isinstance(value, timedelta): + return value + if isinstance(value, (int, float)): + return timedelta(seconds=value) + if isinstance(value, str): + # Try to parse as a number first + try: + return timedelta(seconds=float(value)) + except ValueError: + # If it fails, it might be ISO 8601 format + # Let pydantic's default parser handle it + pass + return value + + def to_lgtype(self) -> lg.TestSettings: + """Convert the test settings to its corresponding LoadGen type.""" + settings = lg.TestSettings() + settings.scenario = self.scenario.to_lgtype() + settings.mode = self.mode.to_lgtype() + + # Server-specific settings + settings.server_target_qps = self.server_target_qps + settings.server_target_latency_ns = round( + self.server_target_latency.total_seconds() * 1e9, + ) + settings.server_target_latency_percentile = ( + self.server_target_latency_percentile + ) + settings.server_coalesce_queries = self.server_coalesce_queries + settings.server_find_peak_qps_decimals_of_precision = ( + self.server_find_peak_qps_decimals_of_precision + ) + settings.server_find_peak_qps_boundary_step_size = ( + self.server_find_peak_qps_boundary_step_size + ) + settings.server_max_async_queries = self.server_max_async_queries + settings.server_num_issue_query_threads = self.server_num_issue_query_threads + + # Offline-specific settings + settings.offline_expected_qps = self.offline_expected_qps + settings.sample_concatenate_permutation = self.sample_concatenate_permutation + + # Test duration settings + settings.min_duration_ms = round( + self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round( + self.max_duration.total_seconds() * 1000) + settings.min_query_count = self.min_query_count + settings.max_query_count = self.max_query_count + + # Random number generation settings + settings.qsl_rng_seed = self.qsl_rng_seed + settings.sample_index_rng_seed = self.sample_index_rng_seed + settings.schedule_rng_seed = self.schedule_rng_seed + settings.accuracy_log_rng_seed = self.accuracy_log_rng_seed + settings.accuracy_log_probability = self.accuracy_log_probability + settings.accuracy_log_sampling_target = self.accuracy_log_sampling_target + + # Test05 settings + settings.test05 = self.test05 + settings.test05_qsl_rng_seed = self.test05_qsl_rng_seed + settings.test05_sample_index_rng_seed = self.test05_sample_index_rng_seed + settings.test05_schedule_rng_seed = self.test05_schedule_rng_seed + + # Performance Sample modifiers + settings.print_timestamps = self.print_timestamps + settings.performance_issue_unique = self.performance_issue_unique + settings.performance_issue_same = self.performance_issue_same + settings.performance_issue_same_index = self.performance_issue_same_index + settings.performance_sample_count_override = ( + self.performance_sample_count_override + ) + settings.use_token_latencies = self.use_token_latencies + settings.ttft_latency = round( + self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round( + self.server_tpot_latency.total_seconds() * 1e9) + settings.infer_token_latencies = self.infer_token_latencies + settings.token_latency_scaling_factor = self.token_latency_scaling_factor + + return settings + + @staticmethod + 
def from_lgtype(lgtype: lg.TestSettings) -> TestSettings: + """Convert the LoadGen's test settings to the TestSettings schema.""" + return TestSettings( + scenario=TestScenario.from_lgtype(lgtype.scenario), + mode=TestMode.from_lgtype(lgtype.mode), + server_target_qps=lgtype.server_target_qps, + server_target_latency=timedelta( + seconds=lgtype.server_target_latency_ns / 1e9, + ), + server_target_latency_percentile=lgtype.server_target_latency_percentile, + server_coalesce_queries=lgtype.server_coalesce_queries, + server_find_peak_qps_decimals_of_precision=lgtype.server_find_peak_qps_decimals_of_precision, + server_find_peak_qps_boundary_step_size=lgtype.server_find_peak_qps_boundary_step_size, + server_max_async_queries=lgtype.server_max_async_queries, + server_num_issue_query_threads=lgtype.server_num_issue_query_threads, + offline_expected_qps=lgtype.offline_expected_qps, + sample_concatenate_permutation=lgtype.sample_concatenate_permutation, + min_duration=timedelta(milliseconds=lgtype.min_duration_ms), + max_duration=timedelta(milliseconds=lgtype.max_duration_ms), + min_query_count=lgtype.min_query_count, + max_query_count=lgtype.max_query_count, + qsl_rng_seed=lgtype.qsl_rng_seed, + sample_index_rng_seed=lgtype.sample_index_rng_seed, + schedule_rng_seed=lgtype.schedule_rng_seed, + accuracy_log_rng_seed=lgtype.accuracy_log_rng_seed, + accuracy_log_probability=lgtype.accuracy_log_probability, + accuracy_log_sampling_target=lgtype.accuracy_log_sampling_target, + test05=lgtype.test05, + test05_qsl_rng_seed=lgtype.test05_qsl_rng_seed, + test05_sample_index_rng_seed=lgtype.test05_sample_index_rng_seed, + test05_schedule_rng_seed=lgtype.test05_schedule_rng_seed, + print_timestamps=lgtype.print_timestamps, + performance_issue_unique=lgtype.performance_issue_unique, + performance_issue_same=lgtype.performance_issue_same, + performance_issue_same_index=lgtype.performance_issue_same_index, + performance_sample_count_override=lgtype.performance_sample_count_override, + use_token_latencies=lgtype.use_token_latencies, + server_ttft_latency=timedelta(seconds=lgtype.ttft_latency / 1e9), + server_tpot_latency=timedelta(seconds=lgtype.tpot_latency / 1e9), + infer_token_latencies=lgtype.infer_token_latencies, + token_latency_scaling_factor=lgtype.token_latency_scaling_factor, + ) + + +class LogOutputSettings(BaseModelWithAttributeDescriptionsFromDocstrings): + """The test log output settings for the MLPerf inference LoadGen.""" + + outdir: DirectoryPath = DirectoryPath("./output") + """Where to save the output files from the benchmark.""" + + prefix: str = "mlperf_log_" + """Modify the filenames of the logs with a prefix.""" + + suffix: str = "" + """Modify the filenames of the logs with a suffix.""" + + prefix_with_datetime: bool = False + """Modify the filenames of the logs with a datetime.""" + + copy_detail_to_stdout: bool = False + """Print details of performance test to stdout.""" + + copy_summary_to_stdout: bool = True + """Print results of performance test to terminal.""" + + @field_validator("outdir", mode="before") + @classmethod + def parse_directory_field(cls, value: str) -> Path: + """Verify and create the output directory to store log files.""" + path = Path(value) + path.mkdir(exist_ok=True) + return path + + def to_lgtype(self) -> lg.LogOutputSettings: + """Convert the log output settings to its corresponding LoadGen type.""" + log_output_settings = lg.LogOutputSettings() + log_output_settings.outdir = self.outdir.as_posix() + log_output_settings.prefix = self.prefix + 
log_output_settings.suffix = self.suffix + log_output_settings.prefix_with_datetime = self.prefix_with_datetime + log_output_settings.copy_detail_to_stdout = self.copy_detail_to_stdout + log_output_settings.copy_summary_to_stdout = self.copy_summary_to_stdout + return log_output_settings + + +class LogSettings(BaseModelWithAttributeDescriptionsFromDocstrings): + """The test log settings for the MLPerf inference LoadGen.""" + + log_output: LogOutputSettings = LogOutputSettings() + """Log output settings""" + + log_mode: LoggingMode = LoggingMode.ASYNC_POLL + """How and when logging should be sampled and stringified at runtime""" + + enable_trace: bool = True + """Enable trace""" + + def to_lgtype(self) -> lg.LogSettings: + """Convert log settings to its corresponding LoadGen type.""" + log_settings = lg.LogSettings() + log_settings.log_output = self.log_output.to_lgtype() + log_settings.log_mode = self.log_mode.to_lgtype() + log_settings.enable_trace = self.enable_trace + return log_settings + + +class UserConf(BaseModelWithAttributeDescriptionsFromDocstrings): + """The user.conf file for specifying LoadGen test settings.""" + + path: FilePath | None = None + """The path to the user.conf file. If provided, the test settings will be overridden + with the settings from the provided user.conf file and the mlperf.conf file from + inside LoadGen. + """ + + model: str = "qwen3-vl-235b-a22b" + """The model name that corresponds to the entries in the mlperf.conf file (in the + LoadGen) which defines the benchmark-wide constraints. + """ + + +class Settings(BaseModelWithAttributeDescriptionsFromDocstrings): + """Combine the settings for the test and logging of LoadGen.""" + + test: TestSettings + """Test settings parameters.""" + + user_conf: UserConf + """The user.conf file for specifying LoadGen test settings.""" + + logging: LogSettings + """Test logging parameters.""" + + @model_validator(mode="after") + def override_test_settings_from_user_conf(self) -> Self: + """Override the test settings from the user.conf file.""" + if self.user_conf.path: + lg_test_settings = self.test.to_lgtype() + lg_test_settings.FromConfig( + str(self.user_conf.path), + self.user_conf.model, + self.test.scenario.value.capitalize(), + ) + self.test = TestSettings.from_lgtype(lg_test_settings) + logger.info( + "Loaded test settings from the user.conf and mlperf.conf files: {}", + self.test, + ) + return self + + def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]: + """Return test and log settings for LoadGen.""" + test_settings = self.test.to_lgtype() + log_settings = self.logging.to_lgtype() + return (test_settings, log_settings) + + +class Model(BaseModelWithAttributeDescriptionsFromDocstrings): + """Specifies the model to use for the Qwen3-VL (Q3VL) benchmark.""" + + repo_id: str = "Qwen/Qwen3-VL-235B-A22B-Instruct" + """The HuggingFace repository ID of the model.""" + + token: str | None = None + """The token to access the HuggingFace repository of the model.""" + + revision: str = "710c13861be6c466e66de3f484069440b8f31389" + """The revision of the model.""" + + +class Dataset(BaseModelWithAttributeDescriptionsFromDocstrings): + """Specifies a dataset on HuggingFace.""" + + repo_id: str = "Shopify/product-catalogue" + """The HuggingFace repository ID of the dataset.""" + + token: str | None = None + """The token to access the HuggingFace repository of the dataset.""" + + revision: str = "d5c517c509f5aca99053897ef1de797d6d7e5aa5" + """The revision of the dataset.""" + + split: list[str] = ["train", "test"] 
+ """Dataset splits to use for the benchmark, e.g., "train" and "test". You can add + multiple splits by repeating the same CLI flag multiple times, e.g.: + --dataset.split test --dataset.split train + The testing dataset is a concatenation of these splits in the same order. + """ + + +class Verbosity(StrEnum): + """The verbosity level of the logger.""" + + TRACE = auto() + """The trace verbosity level.""" + + DEBUG = auto() + """The debug verbosity level.""" + + INFO = auto() + """The info verbosity level (default).""" + + +class SamplingParams(BaseModelWithAttributeDescriptionsFromDocstrings): + """Specifies the sampling parameters for the inference request to the endpoint.""" + + frequency_penalty: float = 0.0 + """Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to repeat + the same line verbatim. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-frequency_penalty + """ + + presence_penalty: float = 0.0 + """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether + they appear in the text so far, increasing the model's likelihood to talk about new + topics. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-presence_penalty + """ + + temperature: float = 1.0 + """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more focused + and deterministic. We generally recommend altering this or top_p but not both. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-temperature + """ + + top_p: float = 1.0 + """An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 means + only the tokens comprising the top 10% probability mass are considered. We generally + recommend altering this or temperature but not both. + See https://platform.openai.com/docs/api-reference/chat/create#chat_create-top_p + """ + + top_k: int = 0 + """Controls the number of top tokens to consider. Set to 0 (or -1) to + consider all tokens. + Note that this is not part of the OpenAI API spec. Therefore, this field will be + passed in via the `extra_body` field of the inference request to the endpoint. + The inference engine therefore needs to support this field, such as what vLLM does + here: + https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L566 + """ + + min_p: float = 0.0 + """Represents the minimum probability for a token to be considered, + relative to the probability of the most likely token. Must be in [0, 1]. + Set to 0 to disable this. + Note that this is not part of the OpenAI API spec. Therefore, this field will be + passed in via the `extra_body` field of the inference request to the endpoint. + The inference engine therefore needs to support this field, such as what vLLM does + here: + https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L567 + """ + + repetition_penalty: float = 1.0 + """Penalizes new tokens based on whether they appear in the prompt and the + generated text so far. Values > 1 encourage the model to use new tokens, + while values < 1 encourage the model to repeat tokens. + Note that this is not part of the OpenAI API spec. 
Therefore, this field will be
+    passed in via the `extra_body` field of the inference request to the endpoint.
+    The inference engine therefore needs to support this field, such as what vLLM does
+    here:
+    https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L568
+    """
+
+
+class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings):
+    """Specifies the OpenAI API endpoint to use for the Qwen3-VL (Q3VL) benchmark."""
+
+    url: str = "http://localhost:8000/v1"
+    """The URL of the OpenAI API endpoint that the inference requests are sent to."""
+
+    api_key: str = ""
+    """The API key to authenticate the inference requests."""
+
+    model: Model
+    """The model to use for the Qwen3-VL (Q3VL) benchmark, i.e., the model that was
+    deployed behind this OpenAI API endpoint.
+    """
+
+    use_guided_decoding: bool = False
+    """If True, the benchmark will enable guided decoding for the requests. This
+    requires the endpoint (and the inference engine behind it) to support guided
+    decoding. If False, the response from the endpoint might not be directly parsable
+    by the response JSON schema (e.g., the JSON object might be fenced in a
+    ```json ... ``` code block).
+    """
+
+    request_timeout: timedelta = timedelta(hours=2)
+    """The timeout for the inference request to the endpoint. The default value for
+    OpenAI API client is 10 minutes
+    (https://github.com/openai/openai-python?tab=readme-ov-file#timeouts) which might
+    not be sufficient for the offline scenario.
+    """
+
+    sampling_params: SamplingParams
+    """The sampling parameters to use for the inference request to the endpoint."""
+
+
+class EndpointToDeploy(Endpoint):
+    """Specifies the endpoint to deploy for the Qwen3-VL (Q3VL) benchmark."""
+
+    startup_timeout: timedelta = timedelta(hours=1)
+    """The timeout for the endpoint to start up."""
+
+    shutdown_timeout: timedelta = timedelta(minutes=1)
+    """The timeout for the endpoint to shut down."""
+
+    poll_interval: timedelta = timedelta(seconds=60)
+    """The interval to poll the endpoint for readiness."""
+
+    healthcheck_timeout: timedelta = timedelta(seconds=5)
+    """The timeout for the healthcheck request to the endpoint."""
+
+
+class VllmEndpoint(EndpointToDeploy):
+    """Specifies how to deploy an OpenAI API endpoint in vLLM for benchmarking."""
+
+    cli: list[str] = []
+    """The CLI arguments to pass to `vllm serve`. This excludes vllm's `--host`,
+    `--port`, `--api-key`, `--hf-token`, `--model` and `--revision` CLI arguments,
+    which are determined by the `url`, `api_key` and `model` fields of this schema."""
+
+    @model_validator(mode="after")
+    def validate_cli(self) -> Self:
+        """Validate the vllm CLI arguments."""
+        for flag in self.cli:
+            if not flag.startswith(("--", "-")):
+                raise PositionalVllmCliFlagError(flag)
+            if flag.split("=", 1)[0] in BlacklistedVllmCliFlagError.BLACKLIST:
+                raise BlacklistedVllmCliFlagError(flag)
+        return self
+
+
+class PositionalVllmCliFlagError(ValueError):
+    """The exception raised when a positional vllm CLI flag is encountered."""
+
+    def __init__(self, flag: str) -> None:
+        """Initialize the exception."""
+        super().__init__(
+            f"Positional vllm CLI flag: {flag} is not allowed. Only optional flags are "
+            "allowed to be passed to `--vllm.cli`.",
+        )
+
+
+class BlacklistedVllmCliFlagError(ValueError):
+    """The exception raised when a blacklisted vllm CLI flag is encountered."""
+
+    BLACKLIST: ClassVar[list[str]] = [
+        "--model",
+        "--revision",
+        "--host",
+        "--port",
+        "--hf-token",
+        "--api-key",
+    ]
+
+    def __init__(self, flag: str) -> None:
+        """Initialize the exception."""
+        super().__init__(
+            f"Blacklisted vllm CLI flag: {flag} is not allowed. The blacklisted flags "
+            f"are {self.BLACKLIST}.",
+        )
+
+
+class ProductMetadata(BaseModelWithAttributeDescriptionsFromDocstrings):
+    """JSON format for the expected responses from the VLM."""
+
+    category: str
+    """The complete category of the product, e.g.,
+    "Clothing & Accessories > Clothing > Shirts > Polo Shirts".
+    Each categorical level is separated by " > ".
+    """
+
+    brand: str
+    """The brand of the product, e.g., "giorgio armani"."""
+
+    is_secondhand: bool
+    """True if the product is second-hand, False otherwise."""
+
+
+class LoadedSample(BaseModelWithAttributeDescriptionsFromDocstrings):
+    """Sample format to be used by LoadGen."""
+
+    messages: list[ChatCompletionMessageParam]
+    """The messages to be sent for chat completion to the VLM inference endpoint."""
+
+    response_format: ResponseFormatJSONSchema | None = None
+    """The response format to be used during guided decoding."""
+
+    @field_validator("messages", mode="after")
+    @classmethod
+    def ensure_content_is_list(
+        cls,
+        messages: list[ChatCompletionMessageParam],
+    ) -> list[ChatCompletionMessageParam]:
+        """If the content is a `ValidatorIterator`, convert it back to a list.
+
+        This is to work around a Pydantic bug. See
+        https://github.com/pydantic/pydantic/issues/9467 for more details.
+        """
+        for message in messages:
+            if (
+                "content" in message
+                and message["content"].__class__.__module__
+                == "pydantic_core._pydantic_core"
+                and message["content"].__class__.__name__ == "ValidatorIterator"
+            ):
+                message["content"] = list(
+                    message["content"])  # type: ignore[arg-type]
+        return messages
diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py
similarity index 88%
rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py
rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py
index c63a690a32..ddcd962ea1 100644
--- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py
+++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py
@@ -1,4 +1,4 @@
-"""Task definitions for the VL2L benchmark."""
+"""Task definitions for the Qwen3-VL (Q3VL) benchmark."""
 
 from __future__ import annotations
 
@@ -13,6 +13,7 @@
 from io import BytesIO
 from typing import Any
 
+import httpx
 import mlperf_loadgen as lg
 from datasets import load_dataset
 from loguru import logger
@@ -56,17 +57,21 @@ def __init__(
             revision=dataset.revision,
             split="+".join(dataset.split),
         )
-        logger.debug(
-            "Loaded {} samples from the dataset splits {}.",
+        logger.info(
+            "Imported {} samples from the dataset splits {}.",
             len(self.dataset),
             dataset.split,
         )
         self.endpoint = endpoint
+        request_timeout_seconds = endpoint.request_timeout.total_seconds()
         self.openai_api_client = AsyncOpenAI(
             base_url=endpoint.url,
-            http_client=DefaultAioHttpClient(),
+            http_client=DefaultAioHttpClient(
+                timeout=httpx.Timeout(
+                    timeout=request_timeout_seconds, connect=5.0),
+            ),
             api_key=endpoint.api_key,
-            timeout=endpoint.request_timeout.total_seconds(),
+            timeout=request_timeout_seconds,
         )
         self.event_loop, 
self.event_loop_thread = ( self._create_event_loop_in_separate_thread() @@ -98,7 +103,7 @@ async def _cancel_all_tasks() -> None: _cancel_all_tasks(), self.event_loop, ).result(timeout=5.0) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.trace("Error cancelling tasks during cleanup: {}", e) # Try to close the OpenAI client gracefully @@ -107,7 +112,7 @@ async def _cancel_all_tasks() -> None: self.openai_api_client.close(), self.event_loop, ).result(timeout=5.0) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.trace("Error closing OpenAI client during cleanup: {}", e) # Stop the event loop and join the thread @@ -204,8 +209,9 @@ def estimated_num_performance_samples(self) -> int: self.total_num_samples, ) logger.debug( - "Estimated number of performance samples that will be loaded into the host" + "Estimated number of performance samples that can be loaded into {} GB host" " memory before testing is {}.", + ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES / 1024 / 1024 / 1024, result, ) if self.settings.performance_sample_count_override > 0: @@ -226,11 +232,22 @@ def _load_samples_to_ram(query_sample_indices: list[int]) -> None: Args: query_sample_indices: The indices of the samples to load to host memory. """ + logger.info( + "Starting to load {} samples to RAM...", + len(query_sample_indices), + ) + tic = time.perf_counter() for index in query_sample_indices: self.loaded_samples[index] = self.formulate_loaded_sample( self.dataset[index], use_guided_decoding=self.endpoint.use_guided_decoding, ) + logger.info( + "Loaded {} samples to RAM, which took {} seconds and {} GB in total.", + len(query_sample_indices), + time.perf_counter() - tic, + asizeof.asizeof(self.loaded_samples) / 1024 / 1024 / 1024, + ) def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: """Called by LoadGen to unload samples from host memory after testing. @@ -239,9 +256,19 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: query_sample_indices: The indices of the samples to unload from host memory. 
""" + logger.info( + "Starting to unload {} samples from RAM...", + len(query_sample_indices), + ) + tic = time.perf_counter() for index in query_sample_indices: sample_to_unload = self.loaded_samples.pop(index, None) del sample_to_unload + logger.info( + "Unloaded {} samples from RAM, which took {} seconds.", + len(query_sample_indices), + time.perf_counter() - tic, + ) return lg.ConstructQSL( self.total_num_samples, @@ -279,6 +306,17 @@ async def _query_endpoint_async_batch( if sample.response_format is not None else None ), + frequency_penalty=self.endpoint.sampling_params.frequency_penalty, + presence_penalty=self.endpoint.sampling_params.presence_penalty, + temperature=self.endpoint.sampling_params.temperature, + top_p=self.endpoint.sampling_params.top_p, + extra_body={ + "top_k": self.endpoint.sampling_params.top_k, + "min_p": self.endpoint.sampling_params.min_p, + "repetition_penalty": ( + self.endpoint.sampling_params.repetition_penalty + ), + }, ) logger.debug( "Received response (ID: {}) from endpoint after {} seconds.", @@ -360,6 +398,17 @@ async def _query_endpoint_async_stream( if sample.response_format is not None else None ), + frequency_penalty=self.endpoint.sampling_params.frequency_penalty, + presence_penalty=self.endpoint.sampling_params.presence_penalty, + temperature=self.endpoint.sampling_params.temperature, + top_p=self.endpoint.sampling_params.top_p, + extra_body={ + "top_k": self.endpoint.sampling_params.top_k, + "min_p": self.endpoint.sampling_params.min_p, + "repetition_penalty": ( + self.endpoint.sampling_params.repetition_penalty + ), + }, ) # iterate asynchronously total_tokens = 0 @@ -472,6 +521,10 @@ def _issue_queries(query_samples: list[lg.QuerySample]) -> None: def _flush_queries() -> None: """Called by the LoadGen to indicate that all queries have been issued.""" + logger.info( + "LoadGen has indicated that all queries have been issued. " + "Waiting for all pending queries to complete...", + ) async def _wait_for_pending_queries_async() -> None: """Wait for all pending queries to complete.""" @@ -494,6 +547,7 @@ async def _wait_for_pending_queries_async() -> None: self.event_loop, ) future.result() + logger.info("All pending queries has completed.") return lg.ConstructSUT(_issue_queries, _flush_queries) diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md deleted file mode 100644 index 5720fb7fd7..0000000000 --- a/multimodal/vl2l/README.md +++ /dev/null @@ -1,193 +0,0 @@ -# Reference Implementation for the Vision-language-to-language (VL2L) Benchmark - -## Quick Start - -This guide demonstrates how you can run the benchmark on your local machine. - -### Create a Conda environment - -Follow [this link](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions) -on how to install Miniconda on your host machine. 
Then, you can create a new conda -environment via: - -```bash -conda create -n mlperf-inf-mm-vl2l python=3.12 -``` - -### Install the VL2L benchmarking CLI - -#### For users - -Install `mlperf-inf-mm-vl2l` with: - -```bash -pip install git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l/ -``` - -#### For developers - -Clone the MLPerf Inference repo via: - -```bash -git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf-inference -``` - -Then enter the repo: - -```bash -cd mlperf-inference/ -``` - -Install `mlperf-inf-mm-vl2l` and the development tools with: - -- On Bash -```bash -pip install -e multimodal/vl2l/[dev] -``` -- On Zsh -```zsh -pip install -e multimodal/vl2l/"[dev]" -``` - -### Post VL2L benchmarking CLI installation - -After installation, you can check the CLI flags that `mlperf-inf-mm-vl2l` can take with: - -```bash -mlperf-inf-mm-vl2l --help -``` - -You can enable shell autocompletion for `mlperf-inf-mm-vl2l` with: - -```bash -mlperf-inf-mm-vl2l --install-completion -``` - -> [!NOTE] -> Shell auto-completion will take effect once you restart the terminal. - -### Start an inference endpoint on your local host machine with vLLM - -Please refer to [this guide on how to launch vLLM for various Qwen3 VL MoE models](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3-VL.html). - -```bash -docker run --gpus all \ # Use all the GPUs on this host machine. - -v ~/.cache/huggingface:/root/.cache/huggingface \ # Use the HuggingFace cache from your host machine. - -p 8000:8000 \ # This assumes the endpoint will use port 8000. - --ipc=host \ # The container can access and utilize the host's IPC mechanisms (e.g., shared memory). - vllm/vllm-openai:nightly \ # You can also use the `:latest` container or a specific release. - --model Qwen/Qwen3-VL-235B-A22B-Instruct \ # Specifies the model for vLLM to deploy. - --tensor-parallel-size 8 \ # 8-way tensor-parallel inference across 8 GPUs. - --limit-mm-per-prompt.video 0 # The input requests will contain images only (i.e., no videos). -``` - -### Run the benchmark for the Offline scenario - -Performance only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario offline --settings.test.mode performance_only -``` - -Accuracy only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario offline --settings.test.mode accuracy_only -``` - -### Run the benchmark for the Server scenario - -Performance only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario server --settings.test.mode performance_only -``` - -Accuracy only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only -``` - -### Evalute the response quality - -```bash -mlperf-inf-mm-vl2l evaluate --filename output/mlperf_log_accuracy.json -``` - -## Docker - -[docker/](docker/) provides examples of Dockerfiles that install the VL2L benchmarking -CLI into the container images of the inference engine. This is useful when you have to -run both the inference engine and the VL2L benchmarking CLI inside the same container, -for example, in a situation where you must use a GPU cluster managed by -[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and -[pyxis](https://github.com/NVIDIA/pyxis). - -As an illustrative example, assuming that you are at the root directory of the MLPerf -Inference repo: - -1. 
You can build a container image against the vLLM's -`vllm/vllm-openai:v0.12.0` release by - -```bash -docker build \ - --build-arg BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 \ - --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=multimodal/vl2l \ - -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ - -t mlperf-inf-mm-vl2l:vllm-openai-v0.12.0 \ - . -``` -> [!NOTE] -> `MLPERF_INF_MM_VL2L_INSTALL_URL` can also take in a remote GitHub location, such as -> `git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l/`. - -2. Afterwards, you can start the container in the interactive mode by - -```bash -docker run --rm -it --gpus all -v ~/.cache:/root/.cache --ipc=host mlperf-inf-mm-vl2l:vllm-openai-v0.12.0 -``` - -### Benchmark against vLLM inside the container - -If you are running `mlperf-inf-mm-vl2l` inside a local environment that has access to -vLLM (such as inside a container that was created using the -[docker/vllm-cuda.Dockerfile](docker/vllm-cuda.Dockerfile)), you can use a single -`mlperf-inf-mm-vl2l benchmark vllm` command to achieve: - -1. Deploy an endpoint using vLLM. -2. Wait for the endpoint to be healthy. -3. Run the benchmark against that endpoint. - -For example, inside the container, you can run the Offline scenario Accuracy only -mode with: - -```bash -mlperf-inf-mm-vl2l benchmark vllm \ - --settings.test.scenario offline \ - --settings.test.mode accuracy_only \ - --dataset.token ... \ - --vllm.cli=--async-scheduling \ - --vllm.cli=--max-model-len=32768 \ - --vllm.cli=--max-num-seqs=1024 \ - --vllm.cli=--compilation-config='{ - "cudagraph_capture_sizes": [ - 1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, - 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, - 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, - 496, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768 - ] - }' \ - --vllm.cli=--limit-mm-per-prompt.video=0 \ - --vllm.cli=--tensor-parallel-size=8 -``` - -## Developer Guide - -### Linting - -You can lint the VL2L benchmark source code by running the following script: - -```bash -bash multimodal/vl2l/scripts/linters.sh -``` \ No newline at end of file diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py deleted file mode 100644 index 5b325fff80..0000000000 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ /dev/null @@ -1,513 +0,0 @@ -"""Schema definitions of various data structures in the VL2L benchmark.""" - -from __future__ import annotations - -from datetime import timedelta -from enum import StrEnum, auto -from pathlib import Path -from typing import Annotated, ClassVar, Self - -import mlperf_loadgen as lg -from openai.types import ResponseFormatJSONSchema -from openai.types.chat import ChatCompletionMessageParam -from pydantic import ( - BaseModel, - ConfigDict, - DirectoryPath, - Field, - NonNegativeInt, - field_validator, - model_validator, -) - -MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES = 100 -ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES = 1 * 1024 * 1024 * 1024 # 1GB - - -class TestScenario(StrEnum): - """The test scenario for the MLPerf inference LoadGen.""" - - SERVER = auto() - """Run the benchmark in server/interactive scenario.""" - - OFFLINE = auto() - """Run the benchmark in offline/batch scenario.""" - - def to_lgtype(self) -> lg.TestScenario: - """Convert the test scenario to its corresponding LoadGen type.""" - match self: - case 
TestScenario.SERVER: - return lg.TestScenario.Server - case TestScenario.OFFLINE: - return lg.TestScenario.Offline - case _: - raise UnknownTestScenarioValueError(self) - - -class UnknownTestScenarioValueError(ValueError): - """The exception raised when an unknown test scenario is encountered.""" - - def __init__(self, test_scenario: TestScenario) -> None: - """Initialize the exception.""" - super().__init__(f"Unknown test scenario: {test_scenario}") - - -class TestMode(StrEnum): - """The test mode for the MLPerf inference LoadGen.""" - - PERFORMANCE_ONLY = auto() - """Run the benchmark to evaluate performance.""" - - ACCURACY_ONLY = auto() - """Run the benchmark to evaluate model quality.""" - - def to_lgtype(self) -> lg.TestMode: - """Convert the test mode to its corresponding LoadGen type.""" - match self: - case TestMode.PERFORMANCE_ONLY: - return lg.TestMode.PerformanceOnly - case TestMode.ACCURACY_ONLY: - return lg.TestMode.AccuracyOnly - case _: - raise UnknownTestModeValueError(self) - - -class UnknownTestModeValueError(ValueError): - """The exception raised when an unknown test mode is encountered.""" - - def __init__(self, test_mode: TestMode) -> None: - """Initialize the exception.""" - super().__init__(f"Unknown test mode: {test_mode}") - - -class LoggingMode(StrEnum): - """Specifies when logging should be sampled and stringified.""" - - ASYNC_POLL = auto() - """ Logs are serialized and output on an IOThread that polls for new logs - at a fixed interval. This is the only mode currently implemented.""" - - END_OF_TEST_ONLY = auto() - """ Not implemented """ - - SYNCHRONOUS = auto() - """ Not implemented """ - - def to_lgtype(self) -> lg.LoggingMode: - """Convert logging mode to its corresponding LoadGen type.""" - match self: - case LoggingMode.ASYNC_POLL: - return lg.LoggingMode.AsyncPoll - case _: - raise UnknownLoggingModeValueError(self) - - -class UnknownLoggingModeValueError(ValueError): - """The exception raised when an unknown logging mode is encountered.""" - - def __init__(self, logging_mode: LoggingMode) -> None: - """Initialize the exception.""" - super().__init__(f"Unknown logging mode: {logging_mode}") - - -class BaseModelWithAttributeDescriptionsFromDocstrings(BaseModel): - """Base model that automatically adds attribute descriptions from docstrings.""" - - model_config = ConfigDict(use_attribute_docstrings=True, extra="forbid") - """Pydantic settings for - - Automatically add the attribute descriptions from docstrings. - - Forbid extra attributes. - """ - - -_DEFAULT_DATASET_SIZE = 48289 -_DEFAULT_MIN_DURATION = timedelta(minutes=10) -_DEFAULT_OFFLINE_EXPECTED_QPS = ( - _DEFAULT_DATASET_SIZE / _DEFAULT_MIN_DURATION.total_seconds() -) - - -class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): - """The test settings for the MLPerf inference LoadGen.""" - - scenario: TestScenario = TestScenario.OFFLINE - """The MLPerf inference benchmarking scenario to run the benchmark in.""" - - mode: TestMode = TestMode.PERFORMANCE_ONLY - """Whether you want to run the benchmark for performance measurement or accuracy - evaluation. - """ - - offline_expected_qps: float = _DEFAULT_OFFLINE_EXPECTED_QPS - """The expected QPS for the offline scenario.""" - - # sample_concatenate_permutation: bool = True # noqa: ERA001 - # """Affects the order in which the samples of the dataset are chosen. 
- # If `False`, it concatenates a single permutation of the dataset (or part - # of it depending on `performance_sample_count_override`) several times up to the - # number of samples requested. - # If `True`, it concatenates a multiple permutation of the dataset (or a - # part of it depending on `performance_sample_count_override`) several times - # up to the number of samples requested. - # """ - - server_expected_qps: float = 10 - """The expected QPS for the server scenario. Loadgen will try to send as many - request as necessary to achieve this value. - """ - - server_target_latency: timedelta = timedelta(seconds=1) - """Expected latency constraint for Server scenario. This is a constraint that we - expect depending on the argument server_expected_qps. When server_expected_qps - increases, we expect the latency to also increase. When server_expected_qps - decreases, we expect the latency to also decrease. - """ - - server_ttft_latency: timedelta = timedelta(seconds=1) - """Time to First Token (TTFT) latency constraint result validation (used when - use_token_latencies is enabled). - """ - - server_tpot_latency: timedelta = timedelta(seconds=1) - """Time per Output Token (TPOT) latency constraint result validation (used when - use_token_latencies is enabled). - """ - - min_duration: timedelta = _DEFAULT_MIN_DURATION - """The minimum testing duration (in seconds or ISO 8601 format like `PT5S`). The - benchmark runs until this value has been met. - """ - - min_query_count: int = _DEFAULT_DATASET_SIZE - """The minimum testing query count. The benchmark runs until this value has been - met. If min_query_count is less than the total number of samples in the dataset, - only the first min_query_count samples will be used during testing. - """ - - performance_sample_count_override: Annotated[ - NonNegativeInt, - Field( - description="The number of samples to use for the performance test. In the " # noqa: S608 - "performance mode, the benchmark will select P random samples from the " - "dataset, then send enough queries using these P samples (and repeating " - "them if necessary) to reach the min_duration and min_query_count. If a " - "non-zero value is passed to this flag, the P will be this value. " - "Otherwise, the benchmark will estimate how many samples can be loaded into" - f" {ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES} bytes of memory " - "based on the memory footprint of randomly selected " - f"{MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES} samples (at most), and then" - " use this estimation as the value P.", - ), - ] = _DEFAULT_DATASET_SIZE - - use_token_latencies: bool = False - """By default, the Server scenario will use `server_target_latency` as the - constraint. When set to True, the Server scenario will use `server_ttft_latency` and - `server_tpot_latency` as the constraint. 
- """ - - @field_validator( - "server_target_latency", - "server_ttft_latency", - "server_tpot_latency", - "min_duration", - mode="before", - ) - @classmethod - def parse_timedelta(cls, value: timedelta | float | - str) -> timedelta | str: - """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" - if isinstance(value, timedelta): - return value - if isinstance(value, (int, float)): - return timedelta(seconds=value) - if isinstance(value, str): - # Try to parse as a number first - try: - return timedelta(seconds=float(value)) - except ValueError: - # If it fails, it might be ISO 8601 format - # Let pydantic's default parser handle it - pass - return value - - def to_lgtype(self) -> lg.TestSettings: - """Convert the test settings to its corresponding LoadGen type.""" - settings = lg.TestSettings() - settings.scenario = self.scenario.to_lgtype() - settings.mode = self.mode.to_lgtype() - settings.offline_expected_qps = self.offline_expected_qps - settings.server_target_qps = self.server_expected_qps - settings.server_target_latency_ns = round( - self.server_target_latency.total_seconds() * 1e9, - ) - settings.ttft_latency = round( - self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round( - self.server_tpot_latency.total_seconds() * 1e9) - settings.min_duration_ms = round( - self.min_duration.total_seconds() * 1000) - settings.min_query_count = self.min_query_count - settings.performance_sample_count_override = ( - self.performance_sample_count_override - ) - settings.use_token_latencies = self.use_token_latencies - return settings - - -class LogOutputSettings(BaseModelWithAttributeDescriptionsFromDocstrings): - """The test log output settings for the MLPerf inference LoadGen.""" - - outdir: DirectoryPath = DirectoryPath("./output") - """Where to save the output files from the benchmark.""" - - prefix: str = "mlperf_log_" - """Modify the filenames of the logs with a prefix.""" - - suffix: str = "" - """Modify the filenames of the logs with a suffix.""" - - prefix_with_datetime: bool = False - """Modify the filenames of the logs with a datetime.""" - - copy_detail_to_stdout: bool = False - """Print details of performance test to stdout.""" - - copy_summary_to_stdout: bool = True - """Print results of performance test to terminal.""" - - @field_validator("outdir", mode="before") - @classmethod - def parse_directory_field(cls, value: str) -> Path: - """Verify and create the output directory to store log files.""" - path = Path(value) - path.mkdir(exist_ok=True) - return path - - def to_lgtype(self) -> lg.LogOutputSettings: - """Convert the log output settings to its corresponding LoadGen type.""" - log_output_settings = lg.LogOutputSettings() - log_output_settings.outdir = self.outdir.as_posix() - log_output_settings.prefix = self.prefix - log_output_settings.suffix = self.suffix - log_output_settings.prefix_with_datetime = self.prefix_with_datetime - log_output_settings.copy_detail_to_stdout = self.copy_detail_to_stdout - log_output_settings.copy_summary_to_stdout = self.copy_summary_to_stdout - return log_output_settings - - -class LogSettings(BaseModelWithAttributeDescriptionsFromDocstrings): - """The test log settings for the MLPerf inference LoadGen.""" - - log_output: LogOutputSettings = LogOutputSettings() - """Log output settings""" - - log_mode: LoggingMode = LoggingMode.ASYNC_POLL - """How and when logging should be sampled and stringified at runtime""" - - enable_trace: bool = True - """Enable trace""" - - def to_lgtype(self) -> lg.LogSettings: - 
"""Convert log settings to its corresponding LoadGen type.""" - log_settings = lg.LogSettings() - log_settings.log_output = self.log_output.to_lgtype() - log_settings.log_mode = self.log_mode.to_lgtype() - log_settings.enable_trace = self.enable_trace - return log_settings - - -class Settings(BaseModelWithAttributeDescriptionsFromDocstrings): - """Combine the settings for the test and logging of LoadGen.""" - - test: TestSettings - """Test settings parameters.""" - - logging: LogSettings - """Test logging parameters.""" - - def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]: - """Return test and log settings for LoadGen.""" - test_settings = self.test.to_lgtype() - log_settings = self.logging.to_lgtype() - return (test_settings, log_settings) - - -class Model(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies the model to use for the VL2L benchmark.""" - - repo_id: str = "Qwen/Qwen3-VL-235B-A22B-Instruct" - """The HuggingFace repository ID of the model.""" - - -class Dataset(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies a dataset on HuggingFace.""" - - repo_id: str = "Shopify/the-catalogue-public-beta" - """The HuggingFace repository ID of the dataset.""" - - token: str | None = None - """The token to access the HuggingFace repository of the dataset.""" - - revision: str | None = None - """The revision of the dataset. If not provided, the default revision (i.e., usually - `main`) will be used. - """ - - split: list[str] = ["train", "test"] - """Dataset splits to use for the benchmark, e.g., "train" and "test". You can add - multiple splits by repeating the same CLI flag multiple times, e.g.: - --dataset.split test --dataset.split train - The testing dataset is a concatenation of these splits in the same order. - """ - - -class Verbosity(StrEnum): - """The verbosity level of the logger.""" - - TRACE = auto() - """The trace verbosity level.""" - - DEBUG = auto() - """The debug verbosity level.""" - - INFO = auto() - """The info verbosity level (default).""" - - -class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies the OpenAI API endpoint to use for the VL2L benchmark.""" - - url: str = "http://localhost:8000/v1" - """The URL of the OpenAI API endpoint that the inference requests are sent to.""" - - api_key: str = "" - """The API key to authenticate the inference requests.""" - - model: Model - """The model to use for the VL2L benchmark, i.e., the model that was deployed behind - this OpenAI API endpoint. - """ - - use_guided_decoding: bool = False - """If True, the benchmark will enable guided decoding for the requests. This - requires the endpoint (and the inference engine behind it) to support guided - decoding. If False, the response from the endpoint might not be directly parsable - by the response JSON schema (e.g., the JSON object might be fenced in a - ```json ... ``` code block). - """ - - request_timeout: timedelta = timedelta(hours=2) - """The timeout for the inference request to the endpoint. The default value for - OpenAI API client is 10 minutes - (https://github.com/openai/openai-python?tab=readme-ov-file#timeouts) which might - not be sufficient for the offline scenario. 
- """ - - -class EndpointToDeploy(Endpoint): - """Specifies the endpoint to deploy for the VL2L benchmark.""" - - startup_timeout: timedelta = timedelta(minutes=20) - """The timeout for the endpoint to start up.""" - - shutdown_timeout: timedelta = timedelta(minutes=1) - """The timeout for the endpoint to shut down.""" - - poll_interval: timedelta = timedelta(seconds=60) - """The interval to poll the endpoint for readiness.""" - - healthcheck_timeout: timedelta = timedelta(seconds=5) - """The timeout for the healthcheck request to the endpoint.""" - - -class VllmEndpoint(EndpointToDeploy): - """Specifies how to deploy an OpenAI API endpoint in vLLM for benchmarking.""" - - cli: list[str] = [] - """The CLI arguments to pass to `vllm serve`. This excludes vllm's `--host`, - `--port`, --api-key` and `--model` CLI arguments which will be determined by - the `url`, `api_key` and `model` fields of this schema.""" - - @model_validator(mode="after") - def validate_cli(self) -> Self: - """Validate the vllm CLI arguments.""" - for flag in self.cli: - if not flag.startswith(("--", "-")): - raise PositionalVllmCliFlagError(flag) - if flag.split("=", 1)[0] in BlacklistedVllmCliFlagError.BLACKLIST: - raise BlacklistedVllmCliFlagError(flag) - return self - - -class PositionalVllmCliFlagError(ValueError): - """The exception raised when a positional vllm CLI flag is encountered.""" - - def __init__(self, flag: str) -> None: - """Initialize the exception.""" - super().__init__( - f"Positional vllm CLI flag: {flag} is not allowed. Only optional flags are " - "allowed to be passed to `--vllm.cli`.", - ) - - -class BlacklistedVllmCliFlagError(ValueError): - """The exception raised when a blacklisted vllm CLI flag is encountered.""" - - BLACKLIST: ClassVar[list[str]] = [ - "--model", "--host", "--port", "--api-key"] - - def __init__(self, flag: str) -> None: - """Initialize the exception.""" - super().__init__( - f"Blacklisted vllm CLI flag: {flag} is not allowed. The blacklisted flags" - f"are {self.BLACKLIST}.", - ) - - -class ProductMetadata(BaseModelWithAttributeDescriptionsFromDocstrings): - """Json format for the expected responses from the VLM.""" - - category: str - """The complete category of the product, e.g., - "Clothing & Accessories > Clothing > Shirts > Polo Shirts". - Each categorical level is separated by " > ". - """ - - brand: str - """The brand of the product, e.g., "giorgio armani".""" - - is_secondhand: bool - """True if the product is second-hand, False otherwise.""" - - -class LoadedSample(BaseModelWithAttributeDescriptionsFromDocstrings): - """Sample format to be used by LoadGen.""" - - messages: list[ChatCompletionMessageParam] - """The messages to be sent for chat completion to the VLM inference endpoint.""" - - response_format: ResponseFormatJSONSchema | None = None - """The response format to be used during guided decoding.""" - - @field_validator("messages", mode="after") - @classmethod - def ensure_content_is_list( - cls, - messages: list[ChatCompletionMessageParam], - ) -> list[ChatCompletionMessageParam]: - """If the content is a `ValidatorIterator`, convert it back to a list. - - This is to workaround a Pydantic bug. See - https://github.com/pydantic/pydantic/issues/9467 for more details. 
- """ - for message in messages: - if ( - "content" in message - and message["content"].__class__.__module__ - == "pydantic_core._pydantic_core" - and message["content"].__class__.__name__ == "ValidatorIterator" - ): - message["content"] = list( - message["content"]) # type: ignore[arg-type] - return messages
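
Reviewer note (not part of the patch): the sketch below illustrates how the new Pydantic settings layer introduced in this change is expected to be driven end to end, i.e., build `Settings`, optionally let a `user.conf` override the test settings, and convert everything to the native LoadGen structs via `to_lgtype()`. The import path `mlperf_inf_mm_q3vl.schema` is an assumption based on the renamed package layout, and the field values are illustrative only.

```python
# Illustrative sketch only; assumes the schema module is importable as
# mlperf_inf_mm_q3vl.schema after the vl2l -> qwen3-vl rename.
from mlperf_inf_mm_q3vl.schema import (
    LogSettings,
    Settings,
    TestMode,
    TestScenario,
    TestSettings,
    UserConf,
)

settings = Settings(
    test=TestSettings(
        scenario=TestScenario.OFFLINE,
        mode=TestMode.PERFORMANCE_ONLY,
        min_duration="PT10M",     # ISO 8601 string, handled by parse_timedelta
        server_ttft_latency=0.1,  # plain seconds are accepted as well
    ),
    # When a real path is given, the override_test_settings_from_user_conf
    # validator calls TestSettings.FromConfig to apply user.conf/mlperf.conf.
    user_conf=UserConf(path=None, model="qwen3-vl-235b-a22b"),
    logging=LogSettings(),
)

# Convert to the native LoadGen structs that the benchmark hands to LoadGen
# (typically via lg.StartTestWithLogSettings alongside the SUT and QSL).
lg_test_settings, lg_log_settings = settings.to_lgtype()
print(lg_test_settings.offline_expected_qps, lg_test_settings.min_duration_ms)
```

When a `user.conf` path is provided, its values take precedence over the programmatic ones above, because the validator reloads the settings through LoadGen's `FromConfig` before the test starts.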