From 5b45943251cf7872db4a2106efd7dcc5666615fe Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Wed, 10 Dec 2025 01:13:32 -0500 Subject: [PATCH 01/39] enable VllmDeployer to fail fast if the underying vllm process failed. --- .../deploy.py | 79 ++++++++++++++----- .../mlperf_inference_multimodal_vl2l/task.py | 51 +++++++++--- 2 files changed, 99 insertions(+), 31 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py index 8db6acfa8a..9d10378dff 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py @@ -7,6 +7,7 @@ import time from abc import ABC, abstractmethod from datetime import timedelta # noqa: TC003 +from pathlib import Path from typing import TYPE_CHECKING, Self from urllib.parse import urlparse @@ -100,11 +101,17 @@ def _startup(self) -> None: """ raise NotImplementedError + @abstractmethod + def _failfast(self) -> None: + """Raise an exception if the endpoint is already detected to be dead.""" + raise NotImplementedError + def _wait_for_ready(self) -> None: """Wait for the endpoint to be ready.""" health_url = self.endpoint.url.rstrip("/v1") + "/health" start_time = time.time() while time.time() - start_time < self.endpoint.startup_timeout.total_seconds(): + self._failfast() logger.info( "Waiting {:0.2f} seconds for endpoint to be ready...", time.time() - start_time, @@ -134,6 +141,31 @@ def _shutdown(self) -> None: raise NotImplementedError +class LocalProcessNotStartedError(RuntimeError): + """The exception raised when the local process is not started yet.""" + + def __init__(self) -> None: + """Initialize the exception.""" + super().__init__("Local process is not started yet.") + + +class LocalProcessDeadError(RuntimeError): + """The exception raised when the local process is already detected to be dead.""" + + def __init__( + self, + returncode: int, + stdout_file_path: Path, + stderr_file_path: Path, + ) -> None: + """Initialize the exception.""" + super().__init__( + f"Local process has already terminated with return code {returncode}. 
" + f"Please check the logs in {stdout_file_path} and " + f"{stderr_file_path} for more details.", + ) + + class LocalProcessDeployer(EndpointDeployer): """Deploy and manage an endpoint that is powered by a local process.""" @@ -146,6 +178,14 @@ def __init__(self, endpoint: EndpointToDeploy, settings: Settings) -> None: """ super().__init__(endpoint=endpoint, settings=settings) self._process: subprocess.Popen | None = None + self._stdout_file_path = get_log_file_path( + key=self._stdout_log_file_key, + settings=self.settings, + ) + self._stderr_file_path = get_log_file_path( + key=self._stderr_log_file_key, + settings=self.settings, + ) @abstractmethod def _build_command(self) -> list[str]: @@ -168,38 +208,40 @@ def _startup(self) -> None: """Start the local process.""" cmd = self._build_command() logger.info("Starting local process with command: {}", cmd) - logger.info( - "Starting local process with environment variables: {}", - os.environ) - - # Get log file paths - stdout_file_path = get_log_file_path( - key=self._stdout_log_file_key, - settings=self.settings, - ) - stderr_file_path = get_log_file_path( - key=self._stderr_log_file_key, - settings=self.settings, - ) + logger.info("Starting local process with environment variables: {}", os.environ) # Start the server process = subprocess.Popen( # noqa: S603 cmd, - stdout=stdout_file_path.open("w"), - stderr=stderr_file_path.open("w"), + stdout=self._stdout_file_path.open("w"), + stderr=self._stderr_file_path.open("w"), text=True, ) logger.info("Started local process with PID: {}", process.pid) logger.info( "Local process stdout will be logged to: {}", - stdout_file_path) + self._stdout_file_path, + ) logger.info( "Local process stderr will be logged to: {}", - stderr_file_path) + self._stderr_file_path, + ) self._process = process + def _failfast(self) -> None: + """Raise an exception if the local process is already detected to be dead.""" + if self._process is None: + raise LocalProcessNotStartedError + returncode = self._process.poll() + if returncode is not None: + raise LocalProcessDeadError( + returncode=returncode, + stdout_file_path=self._stdout_file_path, + stderr_file_path=self._stderr_file_path, + ) + def _shutdown(self) -> None: """Shut down the local process gracefully.""" if self._process is None: @@ -209,8 +251,7 @@ def _shutdown(self) -> None: # Try graceful termination first self._process.terminate() try: - self._process.wait( - timeout=self.endpoint.shutdown_timeout.total_seconds()) + self._process.wait(timeout=self.endpoint.shutdown_timeout.total_seconds()) logger.info("Local process terminated gracefully") except subprocess.TimeoutExpired: logger.warning( diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py index c63a690a32..e4739758d8 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py @@ -13,6 +13,7 @@ from io import BytesIO from typing import Any +import httpx import mlperf_loadgen as lg from datasets import load_dataset from loguru import logger @@ -56,17 +57,20 @@ def __init__( revision=dataset.revision, split="+".join(dataset.split), ) - logger.debug( - "Loaded {} samples from the dataset splits {}.", + logger.info( + "Imported {} samples from the dataset splits {}.", len(self.dataset), dataset.split, ) self.endpoint = endpoint + request_timeout_seconds = endpoint.request_timeout.total_seconds() self.openai_api_client = AsyncOpenAI( 
base_url=endpoint.url, - http_client=DefaultAioHttpClient(), + http_client=DefaultAioHttpClient( + timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), + ), api_key=endpoint.api_key, - timeout=endpoint.request_timeout.total_seconds(), + timeout=request_timeout_seconds, ) self.event_loop, self.event_loop_thread = ( self._create_event_loop_in_separate_thread() @@ -183,9 +187,7 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min( - MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, - self.total_num_samples), + k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -204,9 +206,10 @@ def estimated_num_performance_samples(self) -> int: self.total_num_samples, ) logger.debug( - "Estimated number of performance samples that will be loaded into the host" + "Estimated number of performance samples that can be loaded into {} GB host" " memory before testing is {}.", result, + ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES / 1024 / 1024 / 1024, ) if self.settings.performance_sample_count_override > 0: logger.debug( @@ -226,11 +229,22 @@ def _load_samples_to_ram(query_sample_indices: list[int]) -> None: Args: query_sample_indices: The indices of the samples to load to host memory. """ + logger.info( + "Starting to load {} samples to RAM...", + len(query_sample_indices), + ) + tic = time.perf_counter() for index in query_sample_indices: self.loaded_samples[index] = self.formulate_loaded_sample( self.dataset[index], use_guided_decoding=self.endpoint.use_guided_decoding, ) + logger.info( + "Loaded {} samples to RAM, which took {} seconds and {} GB in total.", + len(query_sample_indices), + time.perf_counter() - tic, + asizeof.asizeof(self.loaded_samples) / 1024 / 1024 / 1024, + ) def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: """Called by LoadGen to unload samples from host memory after testing. @@ -239,9 +253,19 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: query_sample_indices: The indices of the samples to unload from host memory. 
""" + logger.info( + "Starting to unload {} samples from RAM...", + len(query_sample_indices), + ) + tic = time.perf_counter() for index in query_sample_indices: sample_to_unload = self.loaded_samples.pop(index, None) del sample_to_unload + logger.info( + "Unloaded {} samples from RAM, which took {} seconds.", + len(query_sample_indices), + time.perf_counter() - tic, + ) return lg.ConstructQSL( self.total_num_samples, @@ -250,8 +274,7 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -328,8 +351,7 @@ async def _query_endpoint_async_batch( ], ) - async def _query_endpoint_async_stream( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: @@ -472,6 +494,10 @@ def _issue_queries(query_samples: list[lg.QuerySample]) -> None: def _flush_queries() -> None: """Called by the LoadGen to indicate that all queries have been issued.""" + logger.info( + "LoadGen has indicated that all queries have been issued. " + "Waiting for all pending queries to complete...", + ) async def _wait_for_pending_queries_async() -> None: """Wait for all pending queries to complete.""" @@ -494,6 +520,7 @@ async def _wait_for_pending_queries_async() -> None: self.event_loop, ) future.result() + logger.info("All pending queries has completed.") return lg.ConstructSUT(_issue_queries, _flush_queries) From bad5387dbfc95fa145738cc704da41d26609080d Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Wed, 10 Dec 2025 13:08:55 -0800 Subject: [PATCH 02/39] example slurm script for submitting jobs --- multimodal/vl2l/scripts/slurm/benchmark.sh | 32 ++++ multimodal/vl2l/scripts/slurm/evaluate.sh | 18 ++ multimodal/vl2l/scripts/slurm/submit.sh | 204 +++++++++++++++++++++ 3 files changed, 254 insertions(+) create mode 100644 multimodal/vl2l/scripts/slurm/benchmark.sh create mode 100644 multimodal/vl2l/scripts/slurm/evaluate.sh create mode 100644 multimodal/vl2l/scripts/slurm/submit.sh diff --git a/multimodal/vl2l/scripts/slurm/benchmark.sh b/multimodal/vl2l/scripts/slurm/benchmark.sh new file mode 100644 index 0000000000..e564507575 --- /dev/null +++ b/multimodal/vl2l/scripts/slurm/benchmark.sh @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --time=2:00:00 +#SBATCH --partition=batch_short +#SBATCH --gres=gpu:8 +#SBATCH --tasks=1 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --exclusive +#SBATCH --output=benchmark-slurm-output-%j.txt +#SBATCH --error=benchmark-slurm-error-%j.txt + +set -eux +set -o pipefail + +mkdir -p ${OUTPUT_HOST_DIR}/${SLURM_JOB_ID} + +srun \ + --container-image=${CONTAINER_IMAGE} \ + --container-mounts=${CACHE_HOST_DIR}:${CACHE_CONTAINER_DIR},${OUTPUT_HOST_DIR}:${OUTPUT_CONTAINER_DIR} \ + --no-container-mount-home \ + mlperf-inf-mm-vl2l benchmark vllm \ + --settings.test.scenario=${SCENARIO} \ + --settings.test.mode=${MODE} \ + --dataset.token=${DATASET_TOKEN} \ + --vllm.model.repo_id=${MODEL_REPO_ID} \ + --vllm.cli=--async-scheduling \ + --vllm.cli=--max-model-len=32768 \ + --vllm.cli=--max-num-seqs=1024 \ + --vllm.cli=--mm-encoder-tp-mode=data \ + --vllm.cli=--limit-mm-per-prompt.video=0 \ + 
--vllm.cli=--tensor-parallel-size=8 \ + --settings.logging.log_output.outdir=${OUTPUT_CONTAINER_DIR}/${SLURM_JOB_ID} \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/evaluate.sh b/multimodal/vl2l/scripts/slurm/evaluate.sh new file mode 100644 index 0000000000..176c92dc46 --- /dev/null +++ b/multimodal/vl2l/scripts/slurm/evaluate.sh @@ -0,0 +1,18 @@ +#!/bin/bash +#SBATCH --time=1:00:00 +#SBATCH --partition=cpu_short +#SBATCH --nodes=1 +#SBATCH --tasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem-per-cpu=16G +#SBATCH --output=evaluate-slurm-output-%j.txt +#SBATCH --error=evaluate-slurm-error-%j.txt + +srun \ + --container-image=${CONTAINER_IMAGE} \ + --container-mounts=${CACHE_HOST_DIR}:${CACHE_CONTAINER_DIR},${OUTPUT_HOST_DIR}:${OUTPUT_CONTAINER_DIR} \ + --no-container-mount-home \ + mlperf-inf-mm-vl2l evaluate \ + --dataset.token=${DATASET_TOKEN} \ + --filename=${OUTPUT_CONTAINER_DIR}/${SLURM_JOB_ID}/mlperf_log_accuracy.json \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/submit.sh b/multimodal/vl2l/scripts/slurm/submit.sh new file mode 100644 index 0000000000..5194abdeac --- /dev/null +++ b/multimodal/vl2l/scripts/slurm/submit.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +set -eux +set -o pipefail + +DEFAULT_CONTAINER_IMAGE="" +container_image=${DEFAULT_CONTAINER_IMAGE} + +DEFAULT_DATASET_TOKEN="" +dataset_token=${DEFAULT_DATASET_TOKEN} + +DEFAULT_MODEL_REPO_ID=Qwen/Qwen3-VL-235B-A22B-Instruct +model_repo_id=${DEFAULT_MODEL_REPO_ID} + +DEFAULT_SCENARIO=offline +scenario=${DEFAULT_SCENARIO} + +DEFAULT_MODE=accuracy_only +mode=${DEFAULT_MODE} + +DEFAULT_CACHE_HOST_DIR="" +cache_host_dir=${DEFAULT_CACHE_HOST_DIR} + +DEFAULT_OUTPUT_HOST_DIR=$(pwd)/outputs +output_host_dir=${DEFAULT_OUTPUT_HOST_DIR} + +DEFAULT_SLURM_ACCOUNT="" +slurm_account=${DEFAULT_SLURM_ACCOUNT} + +DEFAULT_BENCHMARK_SLURM_PARTITION="" +benchmark_slurm_partition=${DEFAULT_BENCHMARK_SLURM_PARTITION} + +DEFAULT_EVALUATE_SLURM_PARTITION="" +evaluate_slurm_partition=${DEFAULT_EVALUATE_SLURM_PARTITION} + +function _exit_with_help_msg() { + cat < Date: Wed, 10 Dec 2025 21:33:49 -0800 Subject: [PATCH 03/39] fix slurm scripts --- multimodal/vl2l/scripts/slurm/benchmark.sh | 2 -- multimodal/vl2l/scripts/slurm/evaluate.sh | 5 ++++- multimodal/vl2l/scripts/slurm/submit.sh | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/multimodal/vl2l/scripts/slurm/benchmark.sh b/multimodal/vl2l/scripts/slurm/benchmark.sh index e564507575..ba5d64f41b 100644 --- a/multimodal/vl2l/scripts/slurm/benchmark.sh +++ b/multimodal/vl2l/scripts/slurm/benchmark.sh @@ -25,8 +25,6 @@ srun \ --vllm.model.repo_id=${MODEL_REPO_ID} \ --vllm.cli=--async-scheduling \ --vllm.cli=--max-model-len=32768 \ - --vllm.cli=--max-num-seqs=1024 \ - --vllm.cli=--mm-encoder-tp-mode=data \ --vllm.cli=--limit-mm-per-prompt.video=0 \ --vllm.cli=--tensor-parallel-size=8 \ --settings.logging.log_output.outdir=${OUTPUT_CONTAINER_DIR}/${SLURM_JOB_ID} \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/evaluate.sh b/multimodal/vl2l/scripts/slurm/evaluate.sh index 176c92dc46..adc6a66265 100644 --- a/multimodal/vl2l/scripts/slurm/evaluate.sh +++ b/multimodal/vl2l/scripts/slurm/evaluate.sh @@ -9,10 +9,13 @@ #SBATCH --output=evaluate-slurm-output-%j.txt #SBATCH --error=evaluate-slurm-error-%j.txt +export NVIDIA_VISIBLE_DEVICES=void + srun \ --container-image=${CONTAINER_IMAGE} \ --container-mounts=${CACHE_HOST_DIR}:${CACHE_CONTAINER_DIR},${OUTPUT_HOST_DIR}:${OUTPUT_CONTAINER_DIR} \ 
--no-container-mount-home \ + --container-env=NVIDIA_VISIBLE_DEVICES \ mlperf-inf-mm-vl2l evaluate \ --dataset.token=${DATASET_TOKEN} \ - --filename=${OUTPUT_CONTAINER_DIR}/${SLURM_JOB_ID}/mlperf_log_accuracy.json \ No newline at end of file + --filename=${OUTPUT_CONTAINER_DIR}/${BENCHMARK_JOB_ID}/mlperf_log_accuracy.json \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/submit.sh b/multimodal/vl2l/scripts/slurm/submit.sh index 5194abdeac..418e6c65ce 100644 --- a/multimodal/vl2l/scripts/slurm/submit.sh +++ b/multimodal/vl2l/scripts/slurm/submit.sh @@ -197,7 +197,7 @@ benchmark_job_id=$( if [[ "${mode}" == "accuracy_only" ]]; then sbatch \ --dependency=afterok:"${benchmark_job_id}" \ - --export=CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR="${cache_container_dir}",OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR="${output_container_dir}",CONTAINER_IMAGE="${container_image}",DATASET_TOKEN="${dataset_token}" \ + --export=CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR="${cache_container_dir}",OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR="${output_container_dir}",CONTAINER_IMAGE="${container_image}",DATASET_TOKEN="${dataset_token}",BENCHMARK_JOB_ID="${benchmark_job_id}" \ --account="${slurm_account}" \ --partition="${evaluate_slurm_partition}" \ evaluate.sh From 1cdf56348d198bc08d1850edb2f27d66fe5637ed Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 11 Dec 2025 00:39:42 -0500 Subject: [PATCH 04/39] small fix --- multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py index e4739758d8..98831308ad 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py @@ -208,8 +208,8 @@ def estimated_num_performance_samples(self) -> int: logger.debug( "Estimated number of performance samples that can be loaded into {} GB host" " memory before testing is {}.", - result, ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES / 1024 / 1024 / 1024, + result, ) if self.settings.performance_sample_count_override > 0: logger.debug( From d9caddceb78098f78a45ec1498e1b9211432e29a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Dec 2025 05:40:12 +0000 Subject: [PATCH 05/39] [Automated Commit] Format Codebase --- .../src/mlperf_inference_multimodal_vl2l/task.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py index 98831308ad..86edb1cb74 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py @@ -67,7 +67,8 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout( + timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -187,7 +188,9 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), + k=min( + MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, + self.total_num_samples), ) estimation_samples = [ 
self.formulate_loaded_sample( @@ -274,7 +277,8 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -351,7 +355,8 @@ async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> Non ], ) - async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: From 6f6233997b6eacc99ba6344a01bff29792015018 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 11 Dec 2025 00:55:53 -0500 Subject: [PATCH 06/39] Update the readme about the example slurm scripts. --- multimodal/vl2l/README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md index 5720fb7fd7..923f00481c 100644 --- a/multimodal/vl2l/README.md +++ b/multimodal/vl2l/README.md @@ -182,6 +182,34 @@ mlperf-inf-mm-vl2l benchmark vllm \ --vllm.cli=--tensor-parallel-size=8 ``` +## Slurm + +[scripts/slurm/](scripts/slurm/) provide example scripts of running both the benchmark +and the response quality evaluation in a GPU cluster managed by +[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and +[pyxis](https://github.com/NVIDIA/pyxis). Specifically, + +- [scripts/slurm/benchmark.sh](scripts/slurm/benchmark.sh) is a sbatch script that + runs the benchmarking job. +- [scripts/slurm/evaluate.sh](scripts/slurm/evaluate.sh) is a sbatch script that runs + the evaluation job. +- [scripts/slurm/submit.sh](scripts/slurm/submit.sh) is a Bash script that submits both + jobs, where the evaluation job would only run if the benchmarking job has succeeded. + +You can check the CLI flags that [scripts/slurm/submit.sh](scripts/slurm/submit.sh) can +take via: + +```bash +bash submit.sh --help +``` + +> [!NOTE] +> Slurm clusters are often highly customized per organization. If you are unfamiliar +> with Slurm, you should check with the cluster administrator of your organization +> first, get a good understanding of what those example scripts do, and adapt the +> example scripts to the specific settings for the Slurm cluster that you are going +> to use, before you try to launch any jobs. + ## Developer Guide ### Linting From 59dc16744c50fb32ef918a86f7aee1a902caa2f1 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 11 Dec 2025 11:39:03 -0800 Subject: [PATCH 07/39] Change the default endpoint startup timeout to 1 hour in case someone need to download the model for the fisrt time. 
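On a fresh cache, downloading the model checkpoint from HuggingFace can easily exceed
the previous 20-minute default, so `_wait_for_ready()` would give up before the vLLM
server ever became healthy. A minimal sketch of how this timeout bounds the readiness
loop in deploy.py (illustrative only, not code from this patch; `poll_health_endpoint`
is a hypothetical stand-in for the `/health` probe):

    import time
    from datetime import timedelta

    # New default from this patch; the previous default was timedelta(minutes=20).
    startup_timeout = timedelta(hours=1)

    def poll_health_endpoint() -> bool:
        """Stand-in for the GET <endpoint URL>/health probe in _wait_for_ready()."""
        return True  # pretend the server is already up

    start_time = time.time()
    while time.time() - start_time < startup_timeout.total_seconds():
        if poll_health_endpoint():
            break
        time.sleep(10)  # the real loop also calls _failfast() before each poll
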
---
 multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py
index 5b325fff80..9b4c7a7209 100644
--- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py
+++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py
@@ -408,7 +408,7 @@ class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings):
 class EndpointToDeploy(Endpoint):
     """Specifies the endpoint to deploy for the VL2L benchmark."""

-    startup_timeout: timedelta = timedelta(minutes=20)
+    startup_timeout: timedelta = timedelta(hours=1)
     """The timeout for the endpoint to start up."""

     shutdown_timeout: timedelta = timedelta(minutes=1)

From d9c0bcc2cf557519754310a4b196f8ae8e5afaae Mon Sep 17 00:00:00 2001
From: John Calderon
Date: Thu, 11 Dec 2025 15:04:43 -0500
Subject: [PATCH 08/39] change server expected qps and target latency

---
 .../vl2l/src/mlperf_inference_multimodal_vl2l/schema.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py
index 9b4c7a7209..05056edcdf 100644
--- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py
+++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py
@@ -151,12 +151,12 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings):
     # up to the number of samples requested.
     # """

-    server_expected_qps: float = 10
+    server_expected_qps: float = 5
     """The expected QPS for the server scenario. Loadgen will try to send as many
     request as necessary to achieve this value.
     """

-    server_target_latency: timedelta = timedelta(seconds=1)
+    server_target_latency: timedelta = timedelta(seconds=13)
     """Expected latency constraint for Server scenario. This is a constraint that we
     expect depending on the argument server_expected_qps. When server_expected_qps
     increases, we expect the latency to also increase.
When server_expected_qps From a75dc68e1ee71b7c9fa94bc2b3caf20f487e75b7 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Fri, 12 Dec 2025 14:07:27 -0500 Subject: [PATCH 09/39] Change the default dataset repo_id to the new name of the public dataset --- .../schema.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py index 05056edcdf..76ffd222c8 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py @@ -214,8 +214,7 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | - str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -241,12 +240,9 @@ def to_lgtype(self) -> lg.TestSettings: settings.server_target_latency_ns = round( self.server_target_latency.total_seconds() * 1e9, ) - settings.ttft_latency = round( - self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round( - self.server_tpot_latency.total_seconds() * 1e9) - settings.min_duration_ms = round( - self.min_duration.total_seconds() * 1000) + settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) + settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.performance_sample_count_override = ( self.performance_sample_count_override @@ -343,7 +339,7 @@ class Model(BaseModelWithAttributeDescriptionsFromDocstrings): class Dataset(BaseModelWithAttributeDescriptionsFromDocstrings): """Specifies a dataset on HuggingFace.""" - repo_id: str = "Shopify/the-catalogue-public-beta" + repo_id: str = "Shopify/product-catalogue" """The HuggingFace repository ID of the dataset.""" token: str | None = None @@ -454,8 +450,7 @@ def __init__(self, flag: str) -> None: class BlacklistedVllmCliFlagError(ValueError): """The exception raised when a blacklisted vllm CLI flag is encountered.""" - BLACKLIST: ClassVar[list[str]] = [ - "--model", "--host", "--port", "--api-key"] + BLACKLIST: ClassVar[list[str]] = ["--model", "--host", "--port", "--api-key"] def __init__(self, flag: str) -> None: """Initialize the exception.""" @@ -508,6 +503,5 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list( - message["content"]) # type: ignore[arg-type] + message["content"] = list(message["content"]) # type: ignore[arg-type] return messages From 866eba99eb330ab9821cee33320334806b2dee76 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 12 Dec 2025 19:08:03 +0000 Subject: [PATCH 10/39] [Automated Commit] Format Codebase --- .../mlperf_inference_multimodal_vl2l/schema.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py index 76ffd222c8..aa848db3f2 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py 
@@ -214,7 +214,8 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | + str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -240,9 +241,12 @@ def to_lgtype(self) -> lg.TestSettings: settings.server_target_latency_ns = round( self.server_target_latency.total_seconds() * 1e9, ) - settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) - settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) + settings.ttft_latency = round( + self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round( + self.server_tpot_latency.total_seconds() * 1e9) + settings.min_duration_ms = round( + self.min_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.performance_sample_count_override = ( self.performance_sample_count_override @@ -450,7 +454,8 @@ def __init__(self, flag: str) -> None: class BlacklistedVllmCliFlagError(ValueError): """The exception raised when a blacklisted vllm CLI flag is encountered.""" - BLACKLIST: ClassVar[list[str]] = ["--model", "--host", "--port", "--api-key"] + BLACKLIST: ClassVar[list[str]] = [ + "--model", "--host", "--port", "--api-key"] def __init__(self, flag: str) -> None: """Initialize the exception.""" @@ -503,5 +508,6 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list(message["content"]) # type: ignore[arg-type] + message["content"] = list( + message["content"]) # type: ignore[arg-type] return messages From a8a8870f2ac175f16331e80c1430cc18f3a4d1ea Mon Sep 17 00:00:00 2001 From: John Calderon Date: Fri, 12 Dec 2025 15:05:26 -0500 Subject: [PATCH 11/39] evaluate the json file with multiprocess --- .../evaluation.py | 208 +++++++++++------- 1 file changed, 133 insertions(+), 75 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index 2076bdbab8..aaf95080f8 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py @@ -3,12 +3,13 @@ from __future__ import annotations import json +import os +from concurrent.futures import ProcessPoolExecutor from pathlib import Path from typing import TYPE_CHECKING import numpy as np from datasets import load_dataset -from hiclass.metrics import f1 # type: ignore[import-untyped] from loguru import logger from pydantic import ValidationError from rapidfuzz import fuzz # type: ignore[import-untyped] @@ -22,11 +23,12 @@ from .schema import ProductMetadata -_TRUE_CATEGORY_PAD = "<|__TRUE_CATEGORY_PAD__|>" _PRED_CATEGORY_PAD = "<|__PRED_CATEGORY_PAD__|>" _PRED_BRAND_PAD = "<|__PRED_BRAND_PAD__|>" _CATEGORY_SEPARATOR = " > " +_WORKER_CONTEXT = {} +_MAX_JOBS = 4 def get_hierarchical_components( predicted_path: str, @@ -110,7 +112,6 @@ def calculate_hierarchical_f1( return 0.0 if hp + hr == 0 else 2 * (hp * hr) / (hp + hr) - def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: """Calculate the F1 score of brand field. 
@@ -141,7 +142,6 @@ def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: # For 1-to-1 extraction, Accuracy = Recall = Micro F1 return sum(matches) / len(matches) - def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: """Calculate F1 score of is_secondhand field. @@ -159,63 +159,40 @@ def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: return f1_score(y_src, y_pred) -def calculate_hiclass_f1( - data: list[tuple[str, str]], - separator: str = _CATEGORY_SEPARATOR, -) -> float: - """Alt method to calculate hierarchical F1. +def _process_chunk_rnd_brand(args: tuple[str, dict, dict]) -> tuple[str, str]: + """Function to process only chunks for random brand predictions. Args: - data: List of tuples of predicted and true values - separator: The separator used to split the paths into levels of the category. - - Returs: - f1 score + args: Tuple containing """ - y_pred_raw = [] - y_true_raw = [] + pred_brand, elem, data_source = args + # We pass the specific data row needed, or the whole structure if efficient + return (pred_brand, data_source[elem["qsl_idx"]]["ground_truth_brand"]) - for pred, src in data: - path1 = pred.split(separator) - path2 = src.split(separator) - - y_pred_raw.append(path1) - y_true_raw.append(path2) - - # 2. Find the global maximum length across ALL samples - # We check the longest path in both true and pred lists - max_len = max(len(p) for p in y_true_raw + y_pred_raw) - - # 3. Pad all lists to the global max_len - for i in range(len(y_true_raw)): - # Pad Truth - pad_len_true = max_len - len(y_true_raw[i]) - y_true_raw[i] += [_TRUE_CATEGORY_PAD] * pad_len_true +def init_worker(dataset: dict) -> None: + """Initialize worker data to process each chunk. - # Pad Prediction - pad_len_pred = max_len - len(y_pred_raw[i]) - y_pred_raw[i] += [_PRED_CATEGORY_PAD] * pad_len_pred + Args: + dataset: huggingface dataset + """ + _WORKER_CONTEXT["dataset"] = dataset - # 4. Convert to numpy arrays - y_true = np.array(y_true_raw) - y_pred = np.array(y_pred_raw) +def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: + """Retrieve relevant information from each chunk of data. - # 5. Calculate Score - return f1(y_true, y_pred) + Args: + args: Tuple that contains chunk of data and seed + Returns: + Object with processed information + """ + chunk_data, seed = args -def run_evaluation(random_seed: int, filename: FilePath, - dataset: DatasetCLI) -> None: - """Main function to run the evaluation.""" - rng = np.random.default_rng(seed=random_seed) - with Path.open(filename) as f: - model_output = json.load(f) + # 1. Access the global dataset + dataset = _WORKER_CONTEXT["dataset"] - original_data = load_dataset( - dataset.repo_id, - token=dataset.token, - split="+".join(dataset.split), - ) + # 2. 
Create a local, reproducible RNG for this specific chunk + local_rng = np.random.default_rng(seed) num_unparsable_responses = 0 category_dataset_pred_src = [] @@ -223,13 +200,13 @@ def run_evaluation(random_seed: int, filename: FilePath, is_secondhand_pred_src = [] is_secondhand_rand_pred_src = [] brand_pred_src = [] - all_possible_brands = set() + error_messages = [] - for elem in model_output: + for elem in chunk_data: idx = elem["qsl_idx"] response = bytes.fromhex(elem["data"]).decode("utf-8") - ground_truth_item = original_data[idx] + ground_truth_item = dataset[idx] all_possible_brands.add(ground_truth_item["ground_truth_brand"]) try: pred_item = ProductMetadata.model_validate_json(response) @@ -245,14 +222,14 @@ def run_evaluation(random_seed: int, filename: FilePath, ), ), brand=_PRED_BRAND_PAD, - is_secondhand=rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], ) - logger.error( - "Response\n{}\n(for the sample at index {}) cannot be validated against" - " the expected schema. Overwriting this response into \n{}\n", - response, - idx, - pred_item, + error_messages.append( + ( + f"Response\n{response}\n(for the sample at index {idx})" + f"cannot be validated against" + f" the expected schema. Overwriting this response into \n{pred_item}\n", + ), ) category_dataset_pred_src.append( (pred_item.category, ground_truth_item["ground_truth_category"]), @@ -268,35 +245,119 @@ def run_evaluation(random_seed: int, filename: FilePath, ) # random category selection # Uniform distribution is the default - rand_cat = rng.choice( + rand_cat = local_rng.choice( ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection - rand_is_secondhand = rng.choice([True, False]) + rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), ) + return { + "num_unparsable_responses": num_unparsable_responses, + "error_messages": error_messages, + "category_dataset_pred_src": category_dataset_pred_src, + "category_rand_pred_src": category_rand_pred_src, + "is_secondhand_pred_src": is_secondhand_pred_src, + "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, + "brand_pred_src": brand_pred_src, + "all_possible_brands": list(all_possible_brands), + } + + + +def run_evaluation(random_seed: int, filename: FilePath, + dataset: DatasetCLI) -> None: + """Main function to run the evaluation.""" + master_rng = np.random.default_rng(seed=random_seed) + with Path.open(filename) as f: + model_output = json.load(f) + + original_data = load_dataset( + dataset.repo_id, + token=dataset.token, + split="+".join(dataset.split), + ) + + # get number of available CPU and get chunk size + cpu_count = min(os.cpu_count() or 1, _MAX_JOBS) + chunk_size = max(len(model_output) // cpu_count, 1) + # Create chunks + output_chunks = [ + model_output[i : i + chunk_size] + for i in range(0, len(model_output), chunk_size) + ] + + # Generate Seeds + # One seed per chunk to ensure reproducibility. + # The master_rng generates these, + # so the whole run is deterministic based on `random_seed`. 
+ chunk_seeds = master_rng.integers(0, 2**32, size=len(output_chunks)) + + # Zip them: Each task is ([model_out_1, ...], 12345) + tasks = zip(output_chunks, chunk_seeds, strict=False) + + num_unparsable_responses = 0 + err_messages = [] + category_dataset_pred_src = [] + category_rand_pred_src = [] + is_secondhand_pred_src = [] + is_secondhand_rand_pred_src = [] + brand_pred_src = [] + all_possible_brands = [] + + with ProcessPoolExecutor( + max_workers=cpu_count, + initializer=init_worker, + initargs=(original_data,), + ) as executor: + # Execute + chunk_results = list(executor.map(_process_chunk, tasks)) + + for chunk in chunk_results: + num_unparsable_responses += chunk["num_unparsable_responses"] + err_messages.extend(chunk["error_messages"]) + category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) + category_rand_pred_src.extend(chunk["category_rand_pred_src"]) + is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) + is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) + brand_pred_src.extend(chunk["brand_pred_src"]) + all_possible_brands.extend(chunk["all_possible_brands"]) + + for err in err_messages: + logger.error("{}", err) + category_f1_score = calculate_hierarchical_f1(category_dataset_pred_src) - hiclass_f1_score = calculate_hiclass_f1(category_dataset_pred_src) is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src) brand_score = calculate_brand_f1_score(brand_pred_src) rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_hiclass_f1_score = calculate_hiclass_f1(category_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1( is_secondhand_rand_pred_src) + + + all_brands_list = list(set(all_possible_brands)) + random_brand_predictions = master_rng.choice( + all_brands_list, + size=len(model_output)) + + args_list = ( + (pred, elem, original_data) + for pred, elem in zip(random_brand_predictions, model_output, strict=False) + ) + + with ProcessPoolExecutor() as executor: + rand_brand_data = list(executor.map(_process_chunk_rnd_brand, + args_list, + chunksize=chunk_size)) + rand_brand_score = calculate_brand_f1_score( - [ - ( - rng.choice(list(all_possible_brands)), - original_data[elem["qsl_idx"]]["ground_truth_brand"], - ) - for elem in model_output - ], + rand_brand_data, ) logger.info( @@ -307,14 +368,12 @@ def run_evaluation(random_seed: int, filename: FilePath, [ "From accuracy file", category_f1_score, - hiclass_f1_score, brand_score, is_secondhand_f1_score, ], [ "Random selection", rand_cat_f1_score, - rand_hiclass_f1_score, rand_brand_score, rand_is_seconhand_f1_score, ], @@ -322,7 +381,6 @@ def run_evaluation(random_seed: int, filename: FilePath, headers=[ "Results", "Category hierarchical F1 Score", - "Category HiClass F1 Score", "Brand F1 Score", "Is_secondhand F1 Score", ], From 9f3b52ed0a001f09a740745603acf09fc1a8c91e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 12 Dec 2025 20:07:54 +0000 Subject: [PATCH 12/39] [Automated Commit] Format Codebase --- .../evaluation.py | 43 +++++++++++-------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index aaf95080f8..2eb3c7e1b5 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py @@ -30,6 +30,7 @@ _WORKER_CONTEXT = {} _MAX_JOBS = 4 + def 
get_hierarchical_components( predicted_path: str, true_path: str, @@ -112,6 +113,7 @@ def calculate_hierarchical_f1( return 0.0 if hp + hr == 0 else 2 * (hp * hr) / (hp + hr) + def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: """Calculate the F1 score of brand field. @@ -142,6 +144,7 @@ def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: # For 1-to-1 extraction, Accuracy = Recall = Micro F1 return sum(matches) / len(matches) + def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: """Calculate F1 score of is_secondhand field. @@ -169,6 +172,7 @@ def _process_chunk_rnd_brand(args: tuple[str, dict, dict]) -> tuple[str, str]: # We pass the specific data row needed, or the whole structure if efficient return (pred_brand, data_source[elem["qsl_idx"]]["ground_truth_brand"]) + def init_worker(dataset: dict) -> None: """Initialize worker data to process each chunk. @@ -177,6 +181,7 @@ def init_worker(dataset: dict) -> None: """ _WORKER_CONTEXT["dataset"] = dataset + def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: """Retrieve relevant information from each chunk of data. @@ -222,14 +227,15 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ), ), brand=_PRED_BRAND_PAD, - is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice( + [True, False], size=1).tolist()[0], ) error_messages.append( - ( - f"Response\n{response}\n(for the sample at index {idx})" - f"cannot be validated against" - f" the expected schema. Overwriting this response into \n{pred_item}\n", - ), + ( + f"Response\n{response}\n(for the sample at index {idx})" + f"cannot be validated against" + f" the expected schema. Overwriting this response into \n{pred_item}\n", + ), ) category_dataset_pred_src.append( (pred_item.category, ground_truth_item["ground_truth_category"]), @@ -258,16 +264,15 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ) return { - "num_unparsable_responses": num_unparsable_responses, - "error_messages": error_messages, - "category_dataset_pred_src": category_dataset_pred_src, - "category_rand_pred_src": category_rand_pred_src, - "is_secondhand_pred_src": is_secondhand_pred_src, - "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, - "brand_pred_src": brand_pred_src, - "all_possible_brands": list(all_possible_brands), - } - + "num_unparsable_responses": num_unparsable_responses, + "error_messages": error_messages, + "category_dataset_pred_src": category_dataset_pred_src, + "category_rand_pred_src": category_rand_pred_src, + "is_secondhand_pred_src": is_secondhand_pred_src, + "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, + "brand_pred_src": brand_pred_src, + "all_possible_brands": list(all_possible_brands), + } def run_evaluation(random_seed: int, filename: FilePath, @@ -288,7 +293,7 @@ def run_evaluation(random_seed: int, filename: FilePath, chunk_size = max(len(model_output) // cpu_count, 1) # Create chunks output_chunks = [ - model_output[i : i + chunk_size] + model_output[i: i + chunk_size] for i in range(0, len(model_output), chunk_size) ] @@ -324,7 +329,8 @@ def run_evaluation(random_seed: int, filename: FilePath, category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) category_rand_pred_src.extend(chunk["category_rand_pred_src"]) is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) - is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) + is_secondhand_rand_pred_src.extend( + 
chunk["is_secondhand_rand_pred_src"]) brand_pred_src.extend(chunk["brand_pred_src"]) all_possible_brands.extend(chunk["all_possible_brands"]) @@ -340,7 +346,6 @@ def run_evaluation(random_seed: int, filename: FilePath, rand_is_seconhand_f1_score = calculate_secondhand_f1( is_secondhand_rand_pred_src) - all_brands_list = list(set(all_possible_brands)) random_brand_predictions = master_rng.choice( all_brands_list, From 03429094dc75e8ad7af4d7bf768aacd5a63b01bd Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Fri, 12 Dec 2025 15:10:58 -0500 Subject: [PATCH 13/39] change default server_target_latency to 12 --- multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py index 76ffd222c8..13847fe39a 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py @@ -156,7 +156,7 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): request as necessary to achieve this value. """ - server_target_latency: timedelta = timedelta(seconds=13) + server_target_latency: timedelta = timedelta(seconds=12) """Expected latency constraint for Server scenario. This is a constraint that we expect depending on the argument server_expected_qps. When server_expected_qps increases, we expect the latency to also increase. When server_expected_qps From d10d634a30573bbc709850dd01ab8382ee92d3f8 Mon Sep 17 00:00:00 2001 From: John Calderon Date: Fri, 12 Dec 2025 16:12:45 -0500 Subject: [PATCH 14/39] revert evaluation changeS --- .../evaluation.py | 207 ++++++------------ 1 file changed, 72 insertions(+), 135 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index 2eb3c7e1b5..a63d16fd09 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py @@ -3,13 +3,12 @@ from __future__ import annotations import json -import os -from concurrent.futures import ProcessPoolExecutor from pathlib import Path from typing import TYPE_CHECKING import numpy as np from datasets import load_dataset +from hiclass.metrics import f1 # type: ignore[import-untyped] from loguru import logger from pydantic import ValidationError from rapidfuzz import fuzz # type: ignore[import-untyped] @@ -23,13 +22,11 @@ from .schema import ProductMetadata +_TRUE_CATEGORY_PAD = "<|__TRUE_CATEGORY_PAD__|>" _PRED_CATEGORY_PAD = "<|__PRED_CATEGORY_PAD__|>" _PRED_BRAND_PAD = "<|__PRED_BRAND_PAD__|>" _CATEGORY_SEPARATOR = " > " -_WORKER_CONTEXT = {} -_MAX_JOBS = 4 - def get_hierarchical_components( predicted_path: str, @@ -162,42 +159,63 @@ def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: return f1_score(y_src, y_pred) -def _process_chunk_rnd_brand(args: tuple[str, dict, dict]) -> tuple[str, str]: - """Function to process only chunks for random brand predictions. +def calculate_hiclass_f1( + data: list[tuple[str, str]], + separator: str = _CATEGORY_SEPARATOR, +) -> float: + """Alt method to calculate hierarchical F1. Args: - args: Tuple containing + data: List of tuples of predicted and true values + separator: The separator used to split the paths into levels of the category. 
+ + Returs: + f1 score """ - pred_brand, elem, data_source = args - # We pass the specific data row needed, or the whole structure if efficient - return (pred_brand, data_source[elem["qsl_idx"]]["ground_truth_brand"]) + y_pred_raw = [] + y_true_raw = [] + for pred, src in data: + path1 = pred.split(separator) + path2 = src.split(separator) -def init_worker(dataset: dict) -> None: - """Initialize worker data to process each chunk. + y_pred_raw.append(path1) + y_true_raw.append(path2) - Args: - dataset: huggingface dataset - """ - _WORKER_CONTEXT["dataset"] = dataset + # 2. Find the global maximum length across ALL samples + # We check the longest path in both true and pred lists + max_len = max(len(p) for p in y_true_raw + y_pred_raw) + # 3. Pad all lists to the global max_len + for i in range(len(y_true_raw)): + # Pad Truth + pad_len_true = max_len - len(y_true_raw[i]) + y_true_raw[i] += [_TRUE_CATEGORY_PAD] * pad_len_true -def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: - """Retrieve relevant information from each chunk of data. + # Pad Prediction + pad_len_pred = max_len - len(y_pred_raw[i]) + y_pred_raw[i] += [_PRED_CATEGORY_PAD] * pad_len_pred - Args: - args: Tuple that contains chunk of data and seed + # 4. Convert to numpy arrays + y_true = np.array(y_true_raw) + y_pred = np.array(y_pred_raw) + + # 5. Calculate Score + return f1(y_true, y_pred) - Returns: - Object with processed information - """ - chunk_data, seed = args - # 1. Access the global dataset - dataset = _WORKER_CONTEXT["dataset"] +def run_evaluation(random_seed: int, filename: FilePath, + dataset: DatasetCLI) -> None: + """Main function to run the evaluation.""" + rng = np.random.default_rng(seed=random_seed) + with Path.open(filename) as f: + model_output = json.load(f) - # 2. Create a local, reproducible RNG for this specific chunk - local_rng = np.random.default_rng(seed) + original_data = load_dataset( + dataset.repo_id, + token=dataset.token, + split="+".join(dataset.split), + ) num_unparsable_responses = 0 category_dataset_pred_src = [] @@ -205,13 +223,13 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: is_secondhand_pred_src = [] is_secondhand_rand_pred_src = [] brand_pred_src = [] + all_possible_brands = set() - error_messages = [] - for elem in chunk_data: + for elem in model_output: idx = elem["qsl_idx"] response = bytes.fromhex(elem["data"]).decode("utf-8") - ground_truth_item = dataset[idx] + ground_truth_item = original_data[idx] all_possible_brands.add(ground_truth_item["ground_truth_brand"]) try: pred_item = ProductMetadata.model_validate_json(response) @@ -227,15 +245,14 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ), ), brand=_PRED_BRAND_PAD, - is_secondhand=local_rng.choice( - [True, False], size=1).tolist()[0], + is_secondhand=rng.choice([True, False], size=1).tolist()[0], ) - error_messages.append( - ( - f"Response\n{response}\n(for the sample at index {idx})" - f"cannot be validated against" - f" the expected schema. Overwriting this response into \n{pred_item}\n", - ), + logger.error( + "Response\n{}\n(for the sample at index {}) cannot be validated against" + " the expected schema. 
Overwriting this response into \n{}\n", + response, + idx, + pred_item, ) category_dataset_pred_src.append( (pred_item.category, ground_truth_item["ground_truth_category"]), @@ -251,118 +268,35 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ) # random category selection # Uniform distribution is the default - rand_cat = local_rng.choice( + rand_cat = rng.choice( ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection - rand_is_secondhand = local_rng.choice([True, False]) + rand_is_secondhand = rng.choice([True, False]) is_secondhand_rand_pred_src.append( (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), ) - return { - "num_unparsable_responses": num_unparsable_responses, - "error_messages": error_messages, - "category_dataset_pred_src": category_dataset_pred_src, - "category_rand_pred_src": category_rand_pred_src, - "is_secondhand_pred_src": is_secondhand_pred_src, - "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, - "brand_pred_src": brand_pred_src, - "all_possible_brands": list(all_possible_brands), - } - - -def run_evaluation(random_seed: int, filename: FilePath, - dataset: DatasetCLI) -> None: - """Main function to run the evaluation.""" - master_rng = np.random.default_rng(seed=random_seed) - with Path.open(filename) as f: - model_output = json.load(f) - - original_data = load_dataset( - dataset.repo_id, - token=dataset.token, - split="+".join(dataset.split), - ) - - # get number of available CPU and get chunk size - cpu_count = min(os.cpu_count() or 1, _MAX_JOBS) - chunk_size = max(len(model_output) // cpu_count, 1) - # Create chunks - output_chunks = [ - model_output[i: i + chunk_size] - for i in range(0, len(model_output), chunk_size) - ] - - # Generate Seeds - # One seed per chunk to ensure reproducibility. - # The master_rng generates these, - # so the whole run is deterministic based on `random_seed`. 
- chunk_seeds = master_rng.integers(0, 2**32, size=len(output_chunks)) - - # Zip them: Each task is ([model_out_1, ...], 12345) - tasks = zip(output_chunks, chunk_seeds, strict=False) - - num_unparsable_responses = 0 - err_messages = [] - category_dataset_pred_src = [] - category_rand_pred_src = [] - is_secondhand_pred_src = [] - is_secondhand_rand_pred_src = [] - brand_pred_src = [] - all_possible_brands = [] - - with ProcessPoolExecutor( - max_workers=cpu_count, - initializer=init_worker, - initargs=(original_data,), - ) as executor: - # Execute - chunk_results = list(executor.map(_process_chunk, tasks)) - - for chunk in chunk_results: - num_unparsable_responses += chunk["num_unparsable_responses"] - err_messages.extend(chunk["error_messages"]) - category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) - category_rand_pred_src.extend(chunk["category_rand_pred_src"]) - is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) - is_secondhand_rand_pred_src.extend( - chunk["is_secondhand_rand_pred_src"]) - brand_pred_src.extend(chunk["brand_pred_src"]) - all_possible_brands.extend(chunk["all_possible_brands"]) - - for err in err_messages: - logger.error("{}", err) - category_f1_score = calculate_hierarchical_f1(category_dataset_pred_src) + hiclass_f1_score = calculate_hiclass_f1(category_dataset_pred_src) is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src) brand_score = calculate_brand_f1_score(brand_pred_src) rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - + rand_hiclass_f1_score = calculate_hiclass_f1(category_rand_pred_src) rand_is_seconhand_f1_score = calculate_secondhand_f1( is_secondhand_rand_pred_src) - - all_brands_list = list(set(all_possible_brands)) - random_brand_predictions = master_rng.choice( - all_brands_list, - size=len(model_output)) - - args_list = ( - (pred, elem, original_data) - for pred, elem in zip(random_brand_predictions, model_output, strict=False) - ) - - with ProcessPoolExecutor() as executor: - rand_brand_data = list(executor.map(_process_chunk_rnd_brand, - args_list, - chunksize=chunk_size)) - rand_brand_score = calculate_brand_f1_score( - rand_brand_data, + [ + ( + rng.choice(list(all_possible_brands)), + original_data[elem["qsl_idx"]]["ground_truth_brand"], + ) + for elem in model_output + ], ) logger.info( @@ -373,12 +307,14 @@ def run_evaluation(random_seed: int, filename: FilePath, [ "From accuracy file", category_f1_score, + hiclass_f1_score, brand_score, is_secondhand_f1_score, ], [ "Random selection", rand_cat_f1_score, + rand_hiclass_f1_score, rand_brand_score, rand_is_seconhand_f1_score, ], @@ -386,9 +322,10 @@ def run_evaluation(random_seed: int, filename: FilePath, headers=[ "Results", "Category hierarchical F1 Score", + "Category HiClass F1 Score", "Brand F1 Score", "Is_secondhand F1 Score", ], tablefmt="fancy_grid", ), - ) + ) \ No newline at end of file From e75a34a0516cde88b31ed8d9e1ad096a1d6ea604 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 12 Dec 2025 21:13:24 +0000 Subject: [PATCH 15/39] [Automated Commit] Format Codebase --- .../vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index a63d16fd09..2076bdbab8 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py 
@@ -328,4 +328,4 @@ def run_evaluation(random_seed: int, filename: FilePath, ], tablefmt="fancy_grid", ), - ) \ No newline at end of file + ) From 2209ae6059a3b5eb69f84fc5a425403682bf2948 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Sat, 13 Dec 2025 21:12:32 -0800 Subject: [PATCH 16/39] update slurm script --- multimodal/vl2l/scripts/slurm/benchmark.sh | 5 ++-- multimodal/vl2l/scripts/slurm/evaluate.sh | 4 +-- multimodal/vl2l/scripts/slurm/submit.sh | 34 ++++++++++------------ 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/multimodal/vl2l/scripts/slurm/benchmark.sh b/multimodal/vl2l/scripts/slurm/benchmark.sh index ba5d64f41b..2e2770e267 100644 --- a/multimodal/vl2l/scripts/slurm/benchmark.sh +++ b/multimodal/vl2l/scripts/slurm/benchmark.sh @@ -1,6 +1,6 @@ #!/bin/bash -#SBATCH --time=2:00:00 -#SBATCH --partition=batch_short +#SBATCH --time=4:00:00 +#SBATCH --partition=batch #SBATCH --gres=gpu:8 #SBATCH --tasks=1 #SBATCH --nodes=1 @@ -21,7 +21,6 @@ srun \ mlperf-inf-mm-vl2l benchmark vllm \ --settings.test.scenario=${SCENARIO} \ --settings.test.mode=${MODE} \ - --dataset.token=${DATASET_TOKEN} \ --vllm.model.repo_id=${MODEL_REPO_ID} \ --vllm.cli=--async-scheduling \ --vllm.cli=--max-model-len=32768 \ diff --git a/multimodal/vl2l/scripts/slurm/evaluate.sh b/multimodal/vl2l/scripts/slurm/evaluate.sh index adc6a66265..4018a4545f 100644 --- a/multimodal/vl2l/scripts/slurm/evaluate.sh +++ b/multimodal/vl2l/scripts/slurm/evaluate.sh @@ -9,7 +9,8 @@ #SBATCH --output=evaluate-slurm-output-%j.txt #SBATCH --error=evaluate-slurm-error-%j.txt -export NVIDIA_VISIBLE_DEVICES=void +set -eux +set -p pipefail srun \ --container-image=${CONTAINER_IMAGE} \ @@ -17,5 +18,4 @@ srun \ --no-container-mount-home \ --container-env=NVIDIA_VISIBLE_DEVICES \ mlperf-inf-mm-vl2l evaluate \ - --dataset.token=${DATASET_TOKEN} \ --filename=${OUTPUT_CONTAINER_DIR}/${BENCHMARK_JOB_ID}/mlperf_log_accuracy.json \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/submit.sh b/multimodal/vl2l/scripts/slurm/submit.sh index 418e6c65ce..5bc674c3a6 100644 --- a/multimodal/vl2l/scripts/slurm/submit.sh +++ b/multimodal/vl2l/scripts/slurm/submit.sh @@ -6,9 +6,6 @@ set -o pipefail DEFAULT_CONTAINER_IMAGE="" container_image=${DEFAULT_CONTAINER_IMAGE} -DEFAULT_DATASET_TOKEN="" -dataset_token=${DEFAULT_DATASET_TOKEN} - DEFAULT_MODEL_REPO_ID=Qwen/Qwen3-VL-235B-A22B-Instruct model_repo_id=${DEFAULT_MODEL_REPO_ID} @@ -39,7 +36,6 @@ Submit a benchmarking (and optionally, an evaluation) job(s) for the VL2L benchm Usage: ${BASH_SOURCE[0]} [-ci | --container-image] Container image to run the benchmark (default: ${DEFAULT_CONTAINER_IMAGE}). - [-dt | --dataset-token] Access token for the Shopify Global Catalogue dataset (default: ${DEFAULT_DATASET_TOKEN}). [-mri | --model-repo-id] HuggingFace repo ID of the model to benchmark (default: ${DEFAULT_MODEL_REPO_ID}). [-s | --scenario] Benchmark scenario (default: ${DEFAULT_SCENARIO}). [-m | --mode] Benchmark mode (default: ${DEFAULT_MODE}). @@ -67,15 +63,6 @@ while [[ $# -gt 0 ]]; do container_image=${1#*=} shift ;; - -dt | --dataset-token) - dataset_token=$2 - shift - shift - ;; - -dt=* | --dataset-token=*) - dataset_token=${1#*=} - shift - ;; -mri | --model-repo-id) model_repo_id=$2 shift @@ -161,10 +148,6 @@ if [[ -z "${container_image}" ]]; then _exit_with_help_msg "[ERROR] -ci or --container-image is required." 1 fi -if [[ -z "${dataset_token}" ]]; then - _exit_with_help_msg "[ERROR] -dt or --dataset-token is required." 
1 -fi - if [[ -z "${cache_host_dir}" ]]; then _exit_with_help_msg "[ERROR] -chd or --cache-host-dir is required." 1 fi @@ -187,17 +170,30 @@ output_container_dir=/outputs mkdir -p "${output_host_dir}" benchmark_job_id=$( + CACHE_HOST_DIR="${cache_host_dir}" \ + CACHE_CONTAINER_DIR="${cache_container_dir}" \ + OUTPUT_HOST_DIR="${output_host_dir}" \ + OUTPUT_CONTAINER_DIR="${output_container_dir}" \ + CONTAINER_IMAGE="${container_image}" \ + SCENARIO="${scenario}" \ + MODE="${mode}" \ + MODEL_REPO_ID="${model_repo_id}" \ sbatch --parsable \ - --export=CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR="${cache_container_dir}",OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR="${output_container_dir}",CONTAINER_IMAGE="${container_image}",SCENARIO="${scenario}",MODE="${mode}",DATASET_TOKEN="${dataset_token}",MODEL_REPO_ID="${model_repo_id}" \ --account="${slurm_account}" \ --partition="${benchmark_slurm_partition}" \ benchmark.sh ) if [[ "${mode}" == "accuracy_only" ]]; then + CACHE_HOST_DIR="${cache_host_dir}" \ + CACHE_CONTAINER_DIR="${cache_container_dir}" \ + OUTPUT_HOST_DIR="${output_host_dir}" \ + OUTPUT_CONTAINER_DIR="${output_container_dir}" \ + CONTAINER_IMAGE="${container_image}" \ + BENCHMARK_JOB_ID="${benchmark_job_id}" \ + NVIDIA_VISIBLE_DEVICES=void \ sbatch \ --dependency=afterok:"${benchmark_job_id}" \ - --export=CACHE_HOST_DIR="${cache_host_dir}",CACHE_CONTAINER_DIR="${cache_container_dir}",OUTPUT_HOST_DIR="${output_host_dir}",OUTPUT_CONTAINER_DIR="${output_container_dir}",CONTAINER_IMAGE="${container_image}",DATASET_TOKEN="${dataset_token}",BENCHMARK_JOB_ID="${benchmark_job_id}" \ --account="${slurm_account}" \ --partition="${evaluate_slurm_partition}" \ evaluate.sh From 145014335aaa518d6c2e1316583e8b3ac97309d9 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Sun, 14 Dec 2025 18:13:56 -0800 Subject: [PATCH 17/39] update slurm script --- multimodal/vl2l/scripts/slurm/benchmark.sh | 4 +-- multimodal/vl2l/scripts/slurm/submit.sh | 29 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/multimodal/vl2l/scripts/slurm/benchmark.sh b/multimodal/vl2l/scripts/slurm/benchmark.sh index 2e2770e267..9dccc01724 100644 --- a/multimodal/vl2l/scripts/slurm/benchmark.sh +++ b/multimodal/vl2l/scripts/slurm/benchmark.sh @@ -1,7 +1,6 @@ #!/bin/bash #SBATCH --time=4:00:00 #SBATCH --partition=batch -#SBATCH --gres=gpu:8 #SBATCH --tasks=1 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 @@ -21,9 +20,10 @@ srun \ mlperf-inf-mm-vl2l benchmark vllm \ --settings.test.scenario=${SCENARIO} \ --settings.test.mode=${MODE} \ + --settings.test.server_expected_qps=${SERVER_EXPECTED_QPS} \ --vllm.model.repo_id=${MODEL_REPO_ID} \ --vllm.cli=--async-scheduling \ --vllm.cli=--max-model-len=32768 \ --vllm.cli=--limit-mm-per-prompt.video=0 \ - --vllm.cli=--tensor-parallel-size=8 \ + --vllm.cli=--tensor-parallel-size=${TENSOR_PARALLEL_SIZE} \ --settings.logging.log_output.outdir=${OUTPUT_CONTAINER_DIR}/${SLURM_JOB_ID} \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/submit.sh b/multimodal/vl2l/scripts/slurm/submit.sh index 5bc674c3a6..e3356093d5 100644 --- a/multimodal/vl2l/scripts/slurm/submit.sh +++ b/multimodal/vl2l/scripts/slurm/submit.sh @@ -15,6 +15,12 @@ scenario=${DEFAULT_SCENARIO} DEFAULT_MODE=accuracy_only mode=${DEFAULT_MODE} +DEFAULT_SERVER_EXPECTED_QPS=5 +server_expected_qps=${DEFAULT_SERVER_EXPECTED_QPS} + +DEFAULT_TENSOR_PARALLEL_SIZE=8 +tensor_parallel_size=${DEFAULT_TENSOR_PARALLEL_SIZE} + DEFAULT_CACHE_HOST_DIR="" 
cache_host_dir=${DEFAULT_CACHE_HOST_DIR} @@ -39,6 +45,8 @@ Usage: ${BASH_SOURCE[0]} [-mri | --model-repo-id] HuggingFace repo ID of the model to benchmark (default: ${DEFAULT_MODEL_REPO_ID}). [-s | --scenario] Benchmark scenario (default: ${DEFAULT_SCENARIO}). [-m | --mode] Benchmark mode (default: ${DEFAULT_MODE}). + [-seq | --server-expected-qps] The expected QPS for the server scenario (default: ${DEFAULT_SERVER_EXPECTED_QPS}). + [-tps | --tensor-parallel-size] Tensor parallelism size for the model deployment (default: ${DEFAULT_TENSOR_PARALLEL_SIZE}). [-chd | --cache-host-dir] Host directory of the `.cache` directory to which HuggingFace will dump the dataset and the model checkpoint, and vLLM will dump compilation artifacts (default: ${DEFAULT_CACHE_HOST_DIR}). [-ohd | --output-host-dir] Host directory to which the benchmark and evaluation results will be dumped (default: ${DEFAULT_OUTPUT_HOST_DIR}). [-sa | --slurm-account] Slurm account for submitting the benchmark and evaluation jobs (default: ${DEFAULT_SLURM_ACCOUNT}). @@ -90,6 +98,24 @@ while [[ $# -gt 0 ]]; do mode=${1#*=} shift ;; + -seq | --server-expected-qps) + server_expected_qps=$2 + shift + shift + ;; + -seq=* | --server-expected-qps=*) + server_expected_qps=${1#*=} + shift + ;; + -tps | --tensor-parallel-size) + tensor_parallel_size=$2 + shift + shift + ;; + -tps=* | --tensor-parallel-size=*) + tensor_parallel_size=${1#*=} + shift + ;; -chd | --cache-host-dir) cache_host_dir=$2 shift @@ -177,10 +203,13 @@ benchmark_job_id=$( CONTAINER_IMAGE="${container_image}" \ SCENARIO="${scenario}" \ MODE="${mode}" \ + SERVER_EXPECTED_QPS="${server_expected_qps}" \ + TENSOR_PARALLEL_SIZE="${tensor_parallel_size}" \ MODEL_REPO_ID="${model_repo_id}" \ sbatch --parsable \ --account="${slurm_account}" \ --partition="${benchmark_slurm_partition}" \ + --gres=gpu:"${tensor_parallel_size}" \ benchmark.sh ) From 6a5f17dc03b4981fdb746c01c818463fa7454055 Mon Sep 17 00:00:00 2001 From: John Calderon Date: Mon, 15 Dec 2025 15:15:11 -0500 Subject: [PATCH 18/39] revert evaluation.py changes after analysing the discrepancy in is_secondhand f1 score --- .../evaluation.py | 210 +++++++++++------- 1 file changed, 134 insertions(+), 76 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index 2076bdbab8..b51ecd6044 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py @@ -3,12 +3,13 @@ from __future__ import annotations import json +import os +from concurrent.futures import ProcessPoolExecutor from pathlib import Path from typing import TYPE_CHECKING import numpy as np from datasets import load_dataset -from hiclass.metrics import f1 # type: ignore[import-untyped] from loguru import logger from pydantic import ValidationError from rapidfuzz import fuzz # type: ignore[import-untyped] @@ -22,11 +23,12 @@ from .schema import ProductMetadata -_TRUE_CATEGORY_PAD = "<|__TRUE_CATEGORY_PAD__|>" _PRED_CATEGORY_PAD = "<|__PRED_CATEGORY_PAD__|>" _PRED_BRAND_PAD = "<|__PRED_BRAND_PAD__|>" _CATEGORY_SEPARATOR = " > " +_WORKER_CONTEXT = {} +_MAX_JOBS = 4 def get_hierarchical_components( predicted_path: str, @@ -110,7 +112,6 @@ def calculate_hierarchical_f1( return 0.0 if hp + hr == 0 else 2 * (hp * hr) / (hp + hr) - def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: """Calculate the F1 score of brand field. 
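For reference, the chunked-evaluation layout restored in this patch comes down to two pieces: a process-pool initializer that caches shared read-only data once per worker (so the dataset is not re-pickled for every task), and per-chunk seeds drawn from a master RNG so the whole run stays deterministic for a given random_seed. The following is a minimal, self-contained sketch of that pattern with illustrative names only (shared_rows, process_chunk, the toy work), not the benchmark's actual module:

from concurrent.futures import ProcessPoolExecutor

import numpy as np

_WORKER_CONTEXT = {}


def init_worker(shared_rows):
    # Runs once per worker process; keeps the shared payload out of every task pickle.
    _WORKER_CONTEXT["rows"] = shared_rows


def process_chunk(args):
    chunk, seed = args
    rows = _WORKER_CONTEXT["rows"]
    rng = np.random.default_rng(seed)  # local, reproducible RNG for this chunk
    # Toy work: pair each item with a randomly chosen shared row.
    return [(item, rows[rng.integers(0, len(rows))]) for item in chunk]


if __name__ == "__main__":
    shared_rows = ["row-a", "row-b", "row-c"]
    work_items = list(range(8))
    chunks = [work_items[i:i + 4] for i in range(0, len(work_items), 4)]

    # One seed per chunk, all derived from a single master seed.
    master_rng = np.random.default_rng(seed=12345)
    chunk_seeds = master_rng.integers(0, 2**32, size=len(chunks))

    with ProcessPoolExecutor(
        max_workers=2,
        initializer=init_worker,
        initargs=(shared_rows,),
    ) as executor:
        results = list(executor.map(process_chunk, zip(chunks, chunk_seeds)))
    print(results)

Because every chunk owns its seed, the per-chunk random choices (and therefore the aggregated scores) are reproducible regardless of how the executor schedules the chunks across workers.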
@@ -141,7 +142,6 @@ def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: # For 1-to-1 extraction, Accuracy = Recall = Micro F1 return sum(matches) / len(matches) - def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: """Calculate F1 score of is_secondhand field. @@ -159,63 +159,40 @@ def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: return f1_score(y_src, y_pred) -def calculate_hiclass_f1( - data: list[tuple[str, str]], - separator: str = _CATEGORY_SEPARATOR, -) -> float: - """Alt method to calculate hierarchical F1. +def _process_chunk_rnd_brand(args: tuple[str, dict, dict]) -> tuple[str, str]: + """Function to process only chunks for random brand predictions. Args: - data: List of tuples of predicted and true values - separator: The separator used to split the paths into levels of the category. - - Returs: - f1 score + args: Tuple containing """ - y_pred_raw = [] - y_true_raw = [] + pred_brand, elem, data_source = args + # We pass the specific data row needed, or the whole structure if efficient + return (pred_brand, data_source[elem["qsl_idx"]]["ground_truth_brand"]) - for pred, src in data: - path1 = pred.split(separator) - path2 = src.split(separator) - - y_pred_raw.append(path1) - y_true_raw.append(path2) - - # 2. Find the global maximum length across ALL samples - # We check the longest path in both true and pred lists - max_len = max(len(p) for p in y_true_raw + y_pred_raw) - - # 3. Pad all lists to the global max_len - for i in range(len(y_true_raw)): - # Pad Truth - pad_len_true = max_len - len(y_true_raw[i]) - y_true_raw[i] += [_TRUE_CATEGORY_PAD] * pad_len_true +def init_worker(dataset: dict) -> None: + """Initialize worker data to process each chunk. - # Pad Prediction - pad_len_pred = max_len - len(y_pred_raw[i]) - y_pred_raw[i] += [_PRED_CATEGORY_PAD] * pad_len_pred + Args: + dataset: huggingface dataset + """ + _WORKER_CONTEXT["dataset"] = dataset - # 4. Convert to numpy arrays - y_true = np.array(y_true_raw) - y_pred = np.array(y_pred_raw) +def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: + """Retrieve relevant information from each chunk of data. - # 5. Calculate Score - return f1(y_true, y_pred) + Args: + args: Tuple that contains chunk of data and seed + Returns: + Object with processed information + """ + chunk_data, seed = args -def run_evaluation(random_seed: int, filename: FilePath, - dataset: DatasetCLI) -> None: - """Main function to run the evaluation.""" - rng = np.random.default_rng(seed=random_seed) - with Path.open(filename) as f: - model_output = json.load(f) + # 1. Access the global dataset + dataset = _WORKER_CONTEXT["dataset"] - original_data = load_dataset( - dataset.repo_id, - token=dataset.token, - split="+".join(dataset.split), - ) + # 2. 
Create a local, reproducible RNG for this specific chunk + local_rng = np.random.default_rng(seed) num_unparsable_responses = 0 category_dataset_pred_src = [] @@ -223,13 +200,13 @@ def run_evaluation(random_seed: int, filename: FilePath, is_secondhand_pred_src = [] is_secondhand_rand_pred_src = [] brand_pred_src = [] - all_possible_brands = set() + error_messages = [] - for elem in model_output: + for elem in chunk_data: idx = elem["qsl_idx"] response = bytes.fromhex(elem["data"]).decode("utf-8") - ground_truth_item = original_data[idx] + ground_truth_item = dataset[idx] all_possible_brands.add(ground_truth_item["ground_truth_brand"]) try: pred_item = ProductMetadata.model_validate_json(response) @@ -245,14 +222,14 @@ def run_evaluation(random_seed: int, filename: FilePath, ), ), brand=_PRED_BRAND_PAD, - is_secondhand=rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], ) - logger.error( - "Response\n{}\n(for the sample at index {}) cannot be validated against" - " the expected schema. Overwriting this response into \n{}\n", - response, - idx, - pred_item, + error_messages.append( + ( + f"Response\n{response}\n(for the sample at index {idx})" + f"cannot be validated against" + f" the expected schema. Overwriting this response into \n{pred_item}\n", + ), ) category_dataset_pred_src.append( (pred_item.category, ground_truth_item["ground_truth_category"]), @@ -268,35 +245,119 @@ def run_evaluation(random_seed: int, filename: FilePath, ) # random category selection # Uniform distribution is the default - rand_cat = rng.choice( + rand_cat = local_rng.choice( ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection - rand_is_secondhand = rng.choice([True, False]) + rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), ) + return { + "num_unparsable_responses": num_unparsable_responses, + "error_messages": error_messages, + "category_dataset_pred_src": category_dataset_pred_src, + "category_rand_pred_src": category_rand_pred_src, + "is_secondhand_pred_src": is_secondhand_pred_src, + "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, + "brand_pred_src": brand_pred_src, + "all_possible_brands": list(all_possible_brands), + } + + + +def run_evaluation(random_seed: int, filename: FilePath, + dataset: DatasetCLI) -> None: + """Main function to run the evaluation.""" + master_rng = np.random.default_rng(seed=random_seed) + with Path.open(filename) as f: + model_output = json.load(f) + + original_data = load_dataset( + dataset.repo_id, + token=dataset.token, + split="+".join(dataset.split), + ) + + # get number of available CPU and get chunk size + cpu_count = min(os.cpu_count() or 1, _MAX_JOBS) + chunk_size = max(len(model_output) // cpu_count, 1) + # Create chunks + output_chunks = [ + model_output[i : i + chunk_size] + for i in range(0, len(model_output), chunk_size) + ] + + # Generate Seeds + # One seed per chunk to ensure reproducibility. + # The master_rng generates these, + # so the whole run is deterministic based on `random_seed`. 
+ chunk_seeds = master_rng.integers(0, 2**32, size=len(output_chunks)) + + # Zip them: Each task is ([model_out_1, ...], 12345) + tasks = zip(output_chunks, chunk_seeds, strict=False) + + num_unparsable_responses = 0 + err_messages = [] + category_dataset_pred_src = [] + category_rand_pred_src = [] + is_secondhand_pred_src = [] + is_secondhand_rand_pred_src = [] + brand_pred_src = [] + all_possible_brands = [] + + with ProcessPoolExecutor( + max_workers=cpu_count, + initializer=init_worker, + initargs=(original_data,), + ) as executor: + # Execute + chunk_results = list(executor.map(_process_chunk, tasks)) + + for chunk in chunk_results: + num_unparsable_responses += chunk["num_unparsable_responses"] + err_messages.extend(chunk["error_messages"]) + category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) + category_rand_pred_src.extend(chunk["category_rand_pred_src"]) + is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) + is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) + brand_pred_src.extend(chunk["brand_pred_src"]) + all_possible_brands.extend(chunk["all_possible_brands"]) + + for err in err_messages: + logger.error("{}", err) + category_f1_score = calculate_hierarchical_f1(category_dataset_pred_src) - hiclass_f1_score = calculate_hiclass_f1(category_dataset_pred_src) is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src) brand_score = calculate_brand_f1_score(brand_pred_src) rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_hiclass_f1_score = calculate_hiclass_f1(category_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1( is_secondhand_rand_pred_src) + + + all_brands_list = list(set(all_possible_brands)) + random_brand_predictions = master_rng.choice( + all_brands_list, + size=len(model_output)) + + args_list = ( + (pred, elem, original_data) + for pred, elem in zip(random_brand_predictions, model_output, strict=False) + ) + + with ProcessPoolExecutor() as executor: + rand_brand_data = list(executor.map(_process_chunk_rnd_brand, + args_list, + chunksize=chunk_size)) + rand_brand_score = calculate_brand_f1_score( - [ - ( - rng.choice(list(all_possible_brands)), - original_data[elem["qsl_idx"]]["ground_truth_brand"], - ) - for elem in model_output - ], + rand_brand_data, ) logger.info( @@ -307,14 +368,12 @@ def run_evaluation(random_seed: int, filename: FilePath, [ "From accuracy file", category_f1_score, - hiclass_f1_score, brand_score, is_secondhand_f1_score, ], [ "Random selection", rand_cat_f1_score, - rand_hiclass_f1_score, rand_brand_score, rand_is_seconhand_f1_score, ], @@ -322,10 +381,9 @@ def run_evaluation(random_seed: int, filename: FilePath, headers=[ "Results", "Category hierarchical F1 Score", - "Category HiClass F1 Score", "Brand F1 Score", "Is_secondhand F1 Score", ], tablefmt="fancy_grid", ), - ) + ) \ No newline at end of file From d5d2cc8352b5cb49aa20db4fd8afc11c0c3b6174 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Dec 2025 20:15:55 +0000 Subject: [PATCH 19/39] [Automated Commit] Format Codebase --- .../evaluation.py | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index b51ecd6044..2eb3c7e1b5 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py @@ -30,6 +30,7 @@ 
_WORKER_CONTEXT = {} _MAX_JOBS = 4 + def get_hierarchical_components( predicted_path: str, true_path: str, @@ -112,6 +113,7 @@ def calculate_hierarchical_f1( return 0.0 if hp + hr == 0 else 2 * (hp * hr) / (hp + hr) + def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: """Calculate the F1 score of brand field. @@ -142,6 +144,7 @@ def calculate_brand_f1_score(data: list[tuple[str, str]]) -> float: # For 1-to-1 extraction, Accuracy = Recall = Micro F1 return sum(matches) / len(matches) + def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: """Calculate F1 score of is_secondhand field. @@ -169,6 +172,7 @@ def _process_chunk_rnd_brand(args: tuple[str, dict, dict]) -> tuple[str, str]: # We pass the specific data row needed, or the whole structure if efficient return (pred_brand, data_source[elem["qsl_idx"]]["ground_truth_brand"]) + def init_worker(dataset: dict) -> None: """Initialize worker data to process each chunk. @@ -177,6 +181,7 @@ def init_worker(dataset: dict) -> None: """ _WORKER_CONTEXT["dataset"] = dataset + def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: """Retrieve relevant information from each chunk of data. @@ -222,14 +227,15 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ), ), brand=_PRED_BRAND_PAD, - is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice( + [True, False], size=1).tolist()[0], ) error_messages.append( - ( - f"Response\n{response}\n(for the sample at index {idx})" - f"cannot be validated against" - f" the expected schema. Overwriting this response into \n{pred_item}\n", - ), + ( + f"Response\n{response}\n(for the sample at index {idx})" + f"cannot be validated against" + f" the expected schema. Overwriting this response into \n{pred_item}\n", + ), ) category_dataset_pred_src.append( (pred_item.category, ground_truth_item["ground_truth_category"]), @@ -258,16 +264,15 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ) return { - "num_unparsable_responses": num_unparsable_responses, - "error_messages": error_messages, - "category_dataset_pred_src": category_dataset_pred_src, - "category_rand_pred_src": category_rand_pred_src, - "is_secondhand_pred_src": is_secondhand_pred_src, - "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, - "brand_pred_src": brand_pred_src, - "all_possible_brands": list(all_possible_brands), - } - + "num_unparsable_responses": num_unparsable_responses, + "error_messages": error_messages, + "category_dataset_pred_src": category_dataset_pred_src, + "category_rand_pred_src": category_rand_pred_src, + "is_secondhand_pred_src": is_secondhand_pred_src, + "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, + "brand_pred_src": brand_pred_src, + "all_possible_brands": list(all_possible_brands), + } def run_evaluation(random_seed: int, filename: FilePath, @@ -288,7 +293,7 @@ def run_evaluation(random_seed: int, filename: FilePath, chunk_size = max(len(model_output) // cpu_count, 1) # Create chunks output_chunks = [ - model_output[i : i + chunk_size] + model_output[i: i + chunk_size] for i in range(0, len(model_output), chunk_size) ] @@ -324,7 +329,8 @@ def run_evaluation(random_seed: int, filename: FilePath, category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) category_rand_pred_src.extend(chunk["category_rand_pred_src"]) is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) - is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) + 
is_secondhand_rand_pred_src.extend( + chunk["is_secondhand_rand_pred_src"]) brand_pred_src.extend(chunk["brand_pred_src"]) all_possible_brands.extend(chunk["all_possible_brands"]) @@ -340,7 +346,6 @@ def run_evaluation(random_seed: int, filename: FilePath, rand_is_seconhand_f1_score = calculate_secondhand_f1( is_secondhand_rand_pred_src) - all_brands_list = list(set(all_possible_brands)) random_brand_predictions = master_rng.choice( all_brands_list, @@ -386,4 +391,4 @@ def run_evaluation(random_seed: int, filename: FilePath, ], tablefmt="fancy_grid", ), - ) \ No newline at end of file + ) From f72d82dfab3d82c1f5a0284563cf0b4caac64014 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Tue, 16 Dec 2025 12:53:15 -0500 Subject: [PATCH 20/39] linting --- multimodal/vl2l/scripts/slurm/benchmark.sh | 18 ++++----- multimodal/vl2l/scripts/slurm/evaluate.sh | 6 +-- multimodal/vl2l/scripts/slurm/submit.sh | 2 +- .../deploy.py | 2 +- .../evaluation.py | 40 +++++++++---------- .../mlperf_inference_multimodal_vl2l/task.py | 17 +++----- 6 files changed, 38 insertions(+), 47 deletions(-) diff --git a/multimodal/vl2l/scripts/slurm/benchmark.sh b/multimodal/vl2l/scripts/slurm/benchmark.sh index 9dccc01724..3c2a118b07 100644 --- a/multimodal/vl2l/scripts/slurm/benchmark.sh +++ b/multimodal/vl2l/scripts/slurm/benchmark.sh @@ -11,19 +11,19 @@ set -eux set -o pipefail -mkdir -p ${OUTPUT_HOST_DIR}/${SLURM_JOB_ID} +mkdir -p "${OUTPUT_HOST_DIR}"/"${SLURM_JOB_ID}" srun \ - --container-image=${CONTAINER_IMAGE} \ - --container-mounts=${CACHE_HOST_DIR}:${CACHE_CONTAINER_DIR},${OUTPUT_HOST_DIR}:${OUTPUT_CONTAINER_DIR} \ + --container-image="${CONTAINER_IMAGE}" \ + --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ --no-container-mount-home \ mlperf-inf-mm-vl2l benchmark vllm \ - --settings.test.scenario=${SCENARIO} \ - --settings.test.mode=${MODE} \ - --settings.test.server_expected_qps=${SERVER_EXPECTED_QPS} \ - --vllm.model.repo_id=${MODEL_REPO_ID} \ + --settings.test.scenario="${SCENARIO}" \ + --settings.test.mode="${MODE}" \ + --settings.test.server_expected_qps="${SERVER_EXPECTED_QPS}" \ + --vllm.model.repo_id="${MODEL_REPO_ID}" \ --vllm.cli=--async-scheduling \ --vllm.cli=--max-model-len=32768 \ --vllm.cli=--limit-mm-per-prompt.video=0 \ - --vllm.cli=--tensor-parallel-size=${TENSOR_PARALLEL_SIZE} \ - --settings.logging.log_output.outdir=${OUTPUT_CONTAINER_DIR}/${SLURM_JOB_ID} \ No newline at end of file + --vllm.cli=--tensor-parallel-size="${TENSOR_PARALLEL_SIZE}" \ + --settings.logging.log_output.outdir="${OUTPUT_CONTAINER_DIR}"/"${SLURM_JOB_ID}" \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/evaluate.sh b/multimodal/vl2l/scripts/slurm/evaluate.sh index 4018a4545f..4ae554b61d 100644 --- a/multimodal/vl2l/scripts/slurm/evaluate.sh +++ b/multimodal/vl2l/scripts/slurm/evaluate.sh @@ -13,9 +13,9 @@ set -eux set -p pipefail srun \ - --container-image=${CONTAINER_IMAGE} \ - --container-mounts=${CACHE_HOST_DIR}:${CACHE_CONTAINER_DIR},${OUTPUT_HOST_DIR}:${OUTPUT_CONTAINER_DIR} \ + --container-image="${CONTAINER_IMAGE}" \ + --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ --no-container-mount-home \ --container-env=NVIDIA_VISIBLE_DEVICES \ mlperf-inf-mm-vl2l evaluate \ - --filename=${OUTPUT_CONTAINER_DIR}/${BENCHMARK_JOB_ID}/mlperf_log_accuracy.json \ No newline at end of file + --filename="${OUTPUT_CONTAINER_DIR}"/"${BENCHMARK_JOB_ID}"/mlperf_log_accuracy.json \ No newline at 
end of file diff --git a/multimodal/vl2l/scripts/slurm/submit.sh b/multimodal/vl2l/scripts/slurm/submit.sh index e3356093d5..34af7bd89f 100644 --- a/multimodal/vl2l/scripts/slurm/submit.sh +++ b/multimodal/vl2l/scripts/slurm/submit.sh @@ -47,7 +47,7 @@ Usage: ${BASH_SOURCE[0]} [-m | --mode] Benchmark mode (default: ${DEFAULT_MODE}). [-seq | --server-expected-qps] The expected QPS for the server scenario (default: ${DEFAULT_SERVER_EXPECTED_QPS}). [-tps | --tensor-parallel-size] Tensor parallelism size for the model deployment (default: ${DEFAULT_TENSOR_PARALLEL_SIZE}). - [-chd | --cache-host-dir] Host directory of the `.cache` directory to which HuggingFace will dump the dataset and the model checkpoint, and vLLM will dump compilation artifacts (default: ${DEFAULT_CACHE_HOST_DIR}). + [-chd | --cache-host-dir] Host directory of the ".cache" directory to which HuggingFace will dump the dataset and the model checkpoint, and vLLM will dump compilation artifacts (default: ${DEFAULT_CACHE_HOST_DIR}). [-ohd | --output-host-dir] Host directory to which the benchmark and evaluation results will be dumped (default: ${DEFAULT_OUTPUT_HOST_DIR}). [-sa | --slurm-account] Slurm account for submitting the benchmark and evaluation jobs (default: ${DEFAULT_SLURM_ACCOUNT}). [-bsp | --benchmark-slurm-partition] Slurm partition for submitting the benchmarking job; usually a partition with nodes that have GPUs (default: ${DEFAULT_BENCHMARK_SLURM_PARTITION}). diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py index 9d10378dff..ccd9e4efa9 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py @@ -7,7 +7,6 @@ import time from abc import ABC, abstractmethod from datetime import timedelta # noqa: TC003 -from pathlib import Path from typing import TYPE_CHECKING, Self from urllib.parse import urlparse @@ -17,6 +16,7 @@ from .log import get_log_file_path if TYPE_CHECKING: + from pathlib import Path from types import TracebackType from .schema import EndpointToDeploy, Settings, VllmEndpoint diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index 2eb3c7e1b5..74732e7cc7 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py @@ -17,6 +17,8 @@ from tabulate import tabulate if TYPE_CHECKING: + from typing import Any + from pydantic import FilePath from .cli import Dataset as DatasetCLI @@ -60,8 +62,7 @@ def get_hierarchical_components( intersection_count = 0 # Iterate through the paths simultaneously - for pred_cat, true_cat in zip( - predicted_categories, true_categories, strict=False): + for pred_cat, true_cat in zip(predicted_categories, true_categories, strict=False): if pred_cat == true_cat: intersection_count += 1 else: @@ -182,7 +183,7 @@ def init_worker(dataset: dict) -> None: _WORKER_CONTEXT["dataset"] = dataset -def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: +def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: """Retrieve relevant information from each chunk of data. 
Args: @@ -227,14 +228,13 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ), ), brand=_PRED_BRAND_PAD, - is_secondhand=local_rng.choice( - [True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], ) error_messages.append( ( f"Response\n{response}\n(for the sample at index {idx})" - f"cannot be validated against" - f" the expected schema. Overwriting this response into \n{pred_item}\n", + f"cannot be validated against the expected schema. " + f"Overwriting this response into \n{pred_item}\n", ), ) category_dataset_pred_src.append( @@ -251,16 +251,14 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: ) # random category selection # Uniform distribution is the default - rand_cat = local_rng.choice( - ground_truth_item["potential_product_categories"]) + rand_cat = local_rng.choice(ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( - (rand_is_secondhand, - ground_truth_item["ground_truth_is_secondhand"]), + (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), ) return { @@ -275,8 +273,7 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, any]: } -def run_evaluation(random_seed: int, filename: FilePath, - dataset: DatasetCLI) -> None: +def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> None: """Main function to run the evaluation.""" master_rng = np.random.default_rng(seed=random_seed) with Path.open(filename) as f: @@ -293,7 +290,7 @@ def run_evaluation(random_seed: int, filename: FilePath, chunk_size = max(len(model_output) // cpu_count, 1) # Create chunks output_chunks = [ - model_output[i: i + chunk_size] + model_output[i : i + chunk_size] for i in range(0, len(model_output), chunk_size) ] @@ -329,8 +326,7 @@ def run_evaluation(random_seed: int, filename: FilePath, category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) category_rand_pred_src.extend(chunk["category_rand_pred_src"]) is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) - is_secondhand_rand_pred_src.extend( - chunk["is_secondhand_rand_pred_src"]) + is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) brand_pred_src.extend(chunk["brand_pred_src"]) all_possible_brands.extend(chunk["all_possible_brands"]) @@ -343,13 +339,13 @@ def run_evaluation(random_seed: int, filename: FilePath, rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_is_seconhand_f1_score = calculate_secondhand_f1( - is_secondhand_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1(is_secondhand_rand_pred_src) all_brands_list = list(set(all_possible_brands)) random_brand_predictions = master_rng.choice( all_brands_list, - size=len(model_output)) + size=len(model_output), + ) args_list = ( (pred, elem, original_data) @@ -357,9 +353,9 @@ def run_evaluation(random_seed: int, filename: FilePath, ) with ProcessPoolExecutor() as executor: - rand_brand_data = list(executor.map(_process_chunk_rnd_brand, - args_list, - chunksize=chunk_size)) + rand_brand_data = list( + executor.map(_process_chunk_rnd_brand, args_list, chunksize=chunk_size), + ) rand_brand_score = calculate_brand_f1_score( rand_brand_data, diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py 
b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py index 86edb1cb74..6dfa0ec91d 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py @@ -67,8 +67,7 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout( - timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -103,7 +102,7 @@ async def _cancel_all_tasks() -> None: _cancel_all_tasks(), self.event_loop, ).result(timeout=5.0) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.trace("Error cancelling tasks during cleanup: {}", e) # Try to close the OpenAI client gracefully @@ -112,7 +111,7 @@ async def _cancel_all_tasks() -> None: self.openai_api_client.close(), self.event_loop, ).result(timeout=5.0) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.trace("Error closing OpenAI client during cleanup: {}", e) # Stop the event loop and join the thread @@ -188,9 +187,7 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min( - MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, - self.total_num_samples), + k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -277,8 +274,7 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -355,8 +351,7 @@ async def _query_endpoint_async_batch( ], ) - async def _query_endpoint_async_stream( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: From 0e731ed015662e4243af5af59de6fe167c5956eb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Dec 2025 17:53:56 +0000 Subject: [PATCH 21/39] [Automated Commit] Format Codebase --- .../deploy.py | 7 +++-- .../evaluation.py | 28 +++++++++++++------ .../mlperf_inference_multimodal_vl2l/task.py | 13 ++++++--- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py index ccd9e4efa9..b965325b89 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py @@ -208,7 +208,9 @@ def _startup(self) -> None: """Start the local process.""" cmd = self._build_command() logger.info("Starting local process with command: {}", cmd) - logger.info("Starting local process with environment variables: {}", os.environ) + logger.info( + "Starting local process with environment variables: {}", + os.environ) # Start the server process = subprocess.Popen( # noqa: S603 @@ -251,7 +253,8 @@ def _shutdown(self) -> None: # Try graceful termination first self._process.terminate() try: - self._process.wait(timeout=self.endpoint.shutdown_timeout.total_seconds()) + self._process.wait( + 
timeout=self.endpoint.shutdown_timeout.total_seconds()) logger.info("Local process terminated gracefully") except subprocess.TimeoutExpired: logger.warning( diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py index 74732e7cc7..3f99787008 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py @@ -62,7 +62,8 @@ def get_hierarchical_components( intersection_count = 0 # Iterate through the paths simultaneously - for pred_cat, true_cat in zip(predicted_categories, true_categories, strict=False): + for pred_cat, true_cat in zip( + predicted_categories, true_categories, strict=False): if pred_cat == true_cat: intersection_count += 1 else: @@ -228,7 +229,8 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: ), ), brand=_PRED_BRAND_PAD, - is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice( + [True, False], size=1).tolist()[0], ) error_messages.append( ( @@ -251,14 +253,16 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: ) # random category selection # Uniform distribution is the default - rand_cat = local_rng.choice(ground_truth_item["potential_product_categories"]) + rand_cat = local_rng.choice( + ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( - (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), + (rand_is_secondhand, + ground_truth_item["ground_truth_is_secondhand"]), ) return { @@ -273,7 +277,8 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: } -def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> None: +def run_evaluation(random_seed: int, filename: FilePath, + dataset: DatasetCLI) -> None: """Main function to run the evaluation.""" master_rng = np.random.default_rng(seed=random_seed) with Path.open(filename) as f: @@ -290,7 +295,7 @@ def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> chunk_size = max(len(model_output) // cpu_count, 1) # Create chunks output_chunks = [ - model_output[i : i + chunk_size] + model_output[i: i + chunk_size] for i in range(0, len(model_output), chunk_size) ] @@ -326,7 +331,8 @@ def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) category_rand_pred_src.extend(chunk["category_rand_pred_src"]) is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) - is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) + is_secondhand_rand_pred_src.extend( + chunk["is_secondhand_rand_pred_src"]) brand_pred_src.extend(chunk["brand_pred_src"]) all_possible_brands.extend(chunk["all_possible_brands"]) @@ -339,7 +345,8 @@ def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_is_seconhand_f1_score = calculate_secondhand_f1(is_secondhand_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1( + is_secondhand_rand_pred_src) all_brands_list = list(set(all_possible_brands)) random_brand_predictions = master_rng.choice( @@ -354,7 +361,10 @@ def 
run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> with ProcessPoolExecutor() as executor: rand_brand_data = list( - executor.map(_process_chunk_rnd_brand, args_list, chunksize=chunk_size), + executor.map( + _process_chunk_rnd_brand, + args_list, + chunksize=chunk_size), ) rand_brand_score = calculate_brand_f1_score( diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py index 6dfa0ec91d..6fab0b6409 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py @@ -67,7 +67,8 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout( + timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -187,7 +188,9 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), + k=min( + MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, + self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -274,7 +277,8 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -351,7 +355,8 @@ async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> Non ], ) - async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: From 4771f130ce5e030ff8102e1e3f020ecdb46c3739 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Tue, 16 Dec 2025 13:21:56 -0500 Subject: [PATCH 22/39] lock in model and dataset SHA --- multimodal/vl2l/README.md | 11 ++++++ multimodal/vl2l/docker/vllm-cuda.Dockerfile | 2 +- .../deploy.py | 5 +++ .../schema.py | 35 +++++++++++-------- 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md index 923f00481c..f3c3b075ee 100644 --- a/multimodal/vl2l/README.md +++ b/multimodal/vl2l/README.md @@ -210,6 +210,17 @@ bash submit.sh --help > example scripts to the specific settings for the Slurm cluster that you are going > to use, before you try to launch any jobs. 
+## Reference Implementation Specification + +- v6.0 Round + - vLLM version: [v0.12.0](https://github.com/vllm-project/vllm/releases/tag/v0.12.0) + - Model: + - [Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct) + - Commit SHA: [710c13861be6c466e66de3f484069440b8f31389](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct/tree/710c13861be6c466e66de3f484069440b8f31389) + - Dataset: + - [Shopify/product-catalogue](https://huggingface.co/datasets/Shopify/product-catalogue) + - Commit SHA: [d5c517c509f5aca99053897ef1de797d6d7e5aa5](https://huggingface.co/datasets/Shopify/product-catalogue/tree/d5c517c509f5aca99053897ef1de797d6d7e5aa5) + ## Developer Guide ### Linting diff --git a/multimodal/vl2l/docker/vllm-cuda.Dockerfile b/multimodal/vl2l/docker/vllm-cuda.Dockerfile index 0c7597ce76..756cb05a3a 100644 --- a/multimodal/vl2l/docker/vllm-cuda.Dockerfile +++ b/multimodal/vl2l/docker/vllm-cuda.Dockerfile @@ -45,7 +45,7 @@ # # ============================================================================ -ARG BASE_IMAGE_URL=vllm/vllm-openai:nightly +ARG BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 FROM ${BASE_IMAGE_URL} # MLPERF_INF_MM_VL2L_INSTALL_URL can be either: diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py index ccd9e4efa9..de523a4994 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py @@ -297,12 +297,17 @@ def _build_command(self) -> list[str]: "vllm", "serve", self.endpoint.model.repo_id, + "--revision", + self.endpoint.model.revision, "--host", host, "--port", str(port), ] + if self.endpoint.model.token: + cmd.extend(["--hf-token", self.endpoint.model.token]) + # Add API key if provided if self.endpoint.api_key: cmd.extend(["--api-key", self.endpoint.api_key]) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py index ff8a5c1951..8ad2794dec 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py @@ -214,8 +214,7 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | - str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -241,12 +240,9 @@ def to_lgtype(self) -> lg.TestSettings: settings.server_target_latency_ns = round( self.server_target_latency.total_seconds() * 1e9, ) - settings.ttft_latency = round( - self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round( - self.server_tpot_latency.total_seconds() * 1e9) - settings.min_duration_ms = round( - self.min_duration.total_seconds() * 1000) + settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) + settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.performance_sample_count_override = ( self.performance_sample_count_override @@ -339,6 +335,12 @@ class Model(BaseModelWithAttributeDescriptionsFromDocstrings): repo_id: str = "Qwen/Qwen3-VL-235B-A22B-Instruct" """The HuggingFace 
repository ID of the model.""" + token: str | None = None + """The token to access the HuggingFace repository of the model.""" + + revision: str = "710c13861be6c466e66de3f484069440b8f31389" + """The revision of the model.""" + class Dataset(BaseModelWithAttributeDescriptionsFromDocstrings): """Specifies a dataset on HuggingFace.""" @@ -349,10 +351,8 @@ class Dataset(BaseModelWithAttributeDescriptionsFromDocstrings): token: str | None = None """The token to access the HuggingFace repository of the dataset.""" - revision: str | None = None - """The revision of the dataset. If not provided, the default revision (i.e., usually - `main`) will be used. - """ + revision: str = "d5c517c509f5aca99053897ef1de797d6d7e5aa5" + """The revision of the dataset.""" split: list[str] = ["train", "test"] """Dataset splits to use for the benchmark, e.g., "train" and "test". You can add @@ -455,7 +455,13 @@ class BlacklistedVllmCliFlagError(ValueError): """The exception raised when a blacklisted vllm CLI flag is encountered.""" BLACKLIST: ClassVar[list[str]] = [ - "--model", "--host", "--port", "--api-key"] + "--model", + "--revision", + "--host", + "--port", + "--hf-token", + "--api-key", + ] def __init__(self, flag: str) -> None: """Initialize the exception.""" @@ -508,6 +514,5 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list( - message["content"]) # type: ignore[arg-type] + message["content"] = list(message["content"]) # type: ignore[arg-type] return messages From d4d6f78a6e04f241d792e47a3550fae11a106839 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Dec 2025 18:22:40 +0000 Subject: [PATCH 23/39] [Automated Commit] Format Codebase --- .../mlperf_inference_multimodal_vl2l/schema.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py index 8ad2794dec..8c47263dd8 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py @@ -214,7 +214,8 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | + str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -240,9 +241,12 @@ def to_lgtype(self) -> lg.TestSettings: settings.server_target_latency_ns = round( self.server_target_latency.total_seconds() * 1e9, ) - settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) - settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) + settings.ttft_latency = round( + self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round( + self.server_tpot_latency.total_seconds() * 1e9) + settings.min_duration_ms = round( + self.min_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.performance_sample_count_override = ( self.performance_sample_count_override @@ -514,5 +518,6 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = 
list(message["content"]) # type: ignore[arg-type] + message["content"] = list( + message["content"]) # type: ignore[arg-type] return messages From c0d0925df3cc150c0141bff5e5815c59c1b4e84b Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Tue, 16 Dec 2025 13:32:43 -0500 Subject: [PATCH 24/39] Specify model quality target and server target latency in the README --- multimodal/vl2l/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md index f3c3b075ee..fcc57c2680 100644 --- a/multimodal/vl2l/README.md +++ b/multimodal/vl2l/README.md @@ -220,6 +220,12 @@ bash submit.sh --help - Dataset: - [Shopify/product-catalogue](https://huggingface.co/datasets/Shopify/product-catalogue) - Commit SHA: [d5c517c509f5aca99053897ef1de797d6d7e5aa5](https://huggingface.co/datasets/Shopify/product-catalogue/tree/d5c517c509f5aca99053897ef1de797d6d7e5aa5) + - Constraint: + - Model quality: + - Category Hierarchical F1 Score >= `0.7824`. + - Server Scenario: + - Target latency percentile = `0.99`. + - Target latency <= 12 seconds. ## Developer Guide From 7dabbfec847c839f1050d687c9cd54e923e2c7ca Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Wed, 17 Dec 2025 21:37:05 -0500 Subject: [PATCH 25/39] Update loadgen/mlperf.conf --- loadgen/mlperf.conf | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1b825514bd..1681fda64f 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -26,6 +26,7 @@ rgat.*.performance_sample_count_override = 788379 pointpainting.*.performance_sample_count_override = 1024 deepseek-r1.*.performance_sample_count_override = 4388 whisper.*.performance_sample_count_override = 1633 +qwen3-vl-235b-a22b.*.performance_sample_count_override = 48289 # set to 0 to let entire sample set to be performance sample 3d-unet.*.performance_sample_count_override = 0 @@ -67,7 +68,7 @@ llama3_1-8b-edge.*.sample_concatenate_permutation = 1 llama3_1-8b-interactive.*.sample_concatenate_permutation = 1 deepseek-r1.*.sample_concatenate_permutation = 1 whisper.*.sample_concatenate_permutation = 1 - +qwen3-vl-235b-a22b.*.sample_concatenate_permutation = 1 *.Server.target_latency = 10 *.Server.target_latency_percentile = 99 *.Server.target_duration = 0 @@ -91,7 +92,9 @@ llama3_1-8b-edge.*.use_token_latencies = 1 llama3_1-8b-interactive.*.use_token_latencies = 1 deepseek-r1.*.use_token_latencies = 1 whisper.*.use_token_latencies = 1 - +# For the VLM benchmark, the model response is relatively short, therefore we track +# end-to-end latency instead of token latencies. +qwen3-vl-235b-a22b.*.use_token_latencies = 0 # gptj benchmark infers token latencies gptj.*.infer_token_latencies = 1 gptj.*.token_latency_scaling_factor = 69 @@ -132,6 +135,10 @@ deepseek-r1.Server.target_latency = 0 deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 +qwen3-vl-235b-a22b.Server.target_latency = 12000 +qwen3-vl-235b-a22b.Server.ttft_latency = 0 +qwen3-vl-235b-a22b.Server.tpot_latency = 0 + *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 @@ -156,6 +163,7 @@ mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 deepseek-r1.Offline.min_query_count = 4388 whisper.Offline.min_query_count = 1633 +qwen3-vl-235b-a22b.Offline.min_query_count = 48289 # These fields should be defined and overridden by user.conf. 
*.SingleStream.target_latency = 10 From 423cea46a8069b1d1623dc6329d479b90b358f04 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 18 Dec 2025 03:54:42 -0500 Subject: [PATCH 26/39] aligning TestSettings'C++ code with its python binding --- loadgen/bindings/python_api.cc | 16 ++++++++++------ loadgen/test_settings.h | 8 ++++---- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 96396dab92..67f2550375 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -312,6 +312,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::server_num_issue_query_threads) .def_readwrite("offline_expected_qps", &TestSettings::offline_expected_qps) + .def_readwrite("sample_concatenate_permutation", + &TestSettings::sample_concatenate_permutation) .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms) .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms) .def_readwrite("min_query_count", &TestSettings::min_query_count) @@ -324,6 +326,14 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::accuracy_log_rng_seed) .def_readwrite("accuracy_log_probability", &TestSettings::accuracy_log_probability) + .def_readwrite("accuracy_log_sampling_target", + &TestSettings::accuracy_log_sampling_target) + .def_readwrite("test05", &TestSettings::test05) + .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) + .def_readwrite("test05_sample_index_rng_seed", + &TestSettings::test05_sample_index_rng_seed) + .def_readwrite("test05_schedule_rng_seed", + &TestSettings::test05_schedule_rng_seed) .def_readwrite("print_timestamps", &TestSettings::print_timestamps) .def_readwrite("performance_issue_unique", &TestSettings::performance_issue_unique) @@ -333,12 +343,6 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::performance_issue_same_index) .def_readwrite("performance_sample_count_override", &TestSettings::performance_sample_count_override) - .def_readwrite("test05", &TestSettings::test05) - .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) - .def_readwrite("test05_sample_index_rng_seed", - &TestSettings::test05_sample_index_rng_seed) - .def_readwrite("test05_schedule_rng_seed", - &TestSettings::test05_schedule_rng_seed) .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 584d073bb8..2e092e721d 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -234,10 +234,6 @@ struct TestSettings { uint64_t test05_qsl_rng_seed = 0; uint64_t test05_sample_index_rng_seed = 0; uint64_t test05_schedule_rng_seed = 0; - - /// \brief Load mlperf parameter config from file. - int FromConfig(const std::string &path, const std::string &model, - const std::string &scenario, int conf_type = 1); /**@}*/ // ================================== @@ -272,6 +268,10 @@ struct TestSettings { bool infer_token_latencies = false; uint64_t token_latency_scaling_factor; /**@}*/ + + /// \brief Load mlperf parameter config from file. 
+ int FromConfig(const std::string &path, const std::string &model, + const std::string &scenario, int conf_type = 1); }; /// From 817f0e889fcce0234314c57bb72b6869a10224bc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Dec 2025 08:56:12 +0000 Subject: [PATCH 27/39] [Automated Commit] Format Codebase --- loadgen/bindings/python_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 67f2550375..4e72f542ed 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -333,7 +333,7 @@ PYBIND11_MODULE(mlperf_loadgen, m) { .def_readwrite("test05_sample_index_rng_seed", &TestSettings::test05_sample_index_rng_seed) .def_readwrite("test05_schedule_rng_seed", - &TestSettings::test05_schedule_rng_seed) + &TestSettings::test05_schedule_rng_seed) .def_readwrite("print_timestamps", &TestSettings::print_timestamps) .def_readwrite("performance_issue_unique", &TestSettings::performance_issue_unique) From 9d3b36bfd8b418ed37f82768816647003cca3da2 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 18 Dec 2025 18:02:18 -0500 Subject: [PATCH 28/39] remove ttft and tpot from mlperf.conf --- loadgen/mlperf.conf | 2 -- 1 file changed, 2 deletions(-) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1681fda64f..d21a73a47d 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -136,8 +136,6 @@ deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 qwen3-vl-235b-a22b.Server.target_latency = 12000 -qwen3-vl-235b-a22b.Server.ttft_latency = 0 -qwen3-vl-235b-a22b.Server.tpot_latency = 0 *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 From 95f4179253d27db6da290b4674c235d96bfd790f Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 18 Dec 2025 18:04:35 -0500 Subject: [PATCH 29/39] Enable CLI to take in user.conf --- multimodal/vl2l/example_user.conf | 7 + .../mlperf_inference_multimodal_vl2l/cli.py | 6 +- .../schema.py | 352 ++++++++++++++++-- 3 files changed, 320 insertions(+), 45 deletions(-) create mode 100644 multimodal/vl2l/example_user.conf diff --git a/multimodal/vl2l/example_user.conf b/multimodal/vl2l/example_user.conf new file mode 100644 index 0000000000..615c92fe67 --- /dev/null +++ b/multimodal/vl2l/example_user.conf @@ -0,0 +1,7 @@ +*.Offline.target_qps = 80.4816666667 +*.Offline.min_duration = 600000 +*.Offline.min_query_count = 48289 + +*.Server.target_qps = 5.0 +*.Server.min_duration = 600000 +*.Server.min_query_count = 48289 \ No newline at end of file diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py index b487fd51e3..59ff457d75 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py @@ -81,13 +81,11 @@ def _run_benchmark( random_seed: int, ) -> None: """Run the VL2L benchmark.""" + test_settings, log_settings = settings.to_lgtype() logger.info("Running VL2L benchmark with settings: {}", settings) logger.info("Running VL2L benchmark with dataset: {}", dataset) - logger.info( - "Running VL2L benchmark with OpenAI API endpoint: {}", - endpoint) + logger.info("Running VL2L benchmark with OpenAI API endpoint: {}", endpoint) logger.info("Running VL2L benchmark with random seed: {}", random_seed) - test_settings, log_settings = settings.to_lgtype() task = ShopifyGlobalCatalogue( dataset=dataset, endpoint=endpoint, diff --git 
a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py index 8c47263dd8..8ca2646cf3 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py @@ -8,6 +8,7 @@ from typing import Annotated, ClassVar, Self import mlperf_loadgen as lg +from loguru import logger from openai.types import ResponseFormatJSONSchema from openai.types.chat import ChatCompletionMessageParam from pydantic import ( @@ -15,6 +16,7 @@ ConfigDict, DirectoryPath, Field, + FilePath, NonNegativeInt, field_validator, model_validator, @@ -43,11 +45,22 @@ def to_lgtype(self) -> lg.TestScenario: case _: raise UnknownTestScenarioValueError(self) + @staticmethod + def from_lgtype(lgtype: lg.TestScenario) -> TestScenario: + """Convert the LoadGen's test scenario to the TestScenario schema.""" + match lgtype: + case lg.TestScenario.Server: + return TestScenario.SERVER + case lg.TestScenario.Offline: + return TestScenario.OFFLINE + case _: + raise UnknownTestScenarioValueError(lgtype) + class UnknownTestScenarioValueError(ValueError): """The exception raised when an unknown test scenario is encountered.""" - def __init__(self, test_scenario: TestScenario) -> None: + def __init__(self, test_scenario: TestScenario | lg.TestScenario) -> None: """Initialize the exception.""" super().__init__(f"Unknown test scenario: {test_scenario}") @@ -71,11 +84,22 @@ def to_lgtype(self) -> lg.TestMode: case _: raise UnknownTestModeValueError(self) + @staticmethod + def from_lgtype(lgtype: lg.TestMode) -> TestMode: + """Convert the LoadGen's test mode to the TestMode schema.""" + match lgtype: + case lg.TestMode.PerformanceOnly: + return TestMode.PERFORMANCE_ONLY + case lg.TestMode.AccuracyOnly: + return TestMode.ACCURACY_ONLY + case _: + raise UnknownTestModeValueError(lgtype) + class UnknownTestModeValueError(ValueError): """The exception raised when an unknown test mode is encountered.""" - def __init__(self, test_mode: TestMode) -> None: + def __init__(self, test_mode: TestMode | lg.TestMode) -> None: """Initialize the exception.""" super().__init__(f"Unknown test mode: {test_mode}") @@ -138,52 +162,161 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): evaluation. """ - offline_expected_qps: float = _DEFAULT_OFFLINE_EXPECTED_QPS - """The expected QPS for the offline scenario.""" - - # sample_concatenate_permutation: bool = True # noqa: ERA001 - # """Affects the order in which the samples of the dataset are chosen. - # If `False`, it concatenates a single permutation of the dataset (or part - # of it depending on `performance_sample_count_override`) several times up to the - # number of samples requested. - # If `True`, it concatenates a multiple permutation of the dataset (or a - # part of it depending on `performance_sample_count_override`) several times - # up to the number of samples requested. - # """ - - server_expected_qps: float = 5 - """The expected QPS for the server scenario. Loadgen will try to send as many - request as necessary to achieve this value. + """Server-specific settings""" + + server_target_qps: float = 5 + """The average QPS of the poisson distribution. Note: This field is used as a + FindPeakPerformance's lower bound. When you run FindPeakPerformanceMode, you should + make sure that this value satisfies performance constraints. """ server_target_latency: timedelta = timedelta(seconds=12) - """Expected latency constraint for Server scenario. 
This is a constraint that we - expect depending on the argument server_expected_qps. When server_expected_qps - increases, we expect the latency to also increase. When server_expected_qps - decreases, we expect the latency to also decrease. + """The latency constraint for the Server scenario.""" + + server_target_latency_percentile: float = 0.99 + """The latency percentile for server mode. This value is combined with + server_target_latency to determine if a run is valid. """ - server_ttft_latency: timedelta = timedelta(seconds=1) - """Time to First Token (TTFT) latency constraint result validation (used when - use_token_latencies is enabled). + server_coalesce_queries: bool = False + """If this flag is set to True, LoadGen will combine samples from + multiple queries into a single query if their scheduled issue times have + passed. """ - server_tpot_latency: timedelta = timedelta(seconds=1) - """Time per Output Token (TPOT) latency constraint result validation (used when - use_token_latencies is enabled). + server_find_peak_qps_decimals_of_precision: int = 1 + """The decimal places of QPS precision used to terminate + FindPeakPerformance mode. + """ + + server_find_peak_qps_boundary_step_size: float = 1 + """The step size (as a fraction of the QPS) used to widen the lower and + upper bounds to find the initial boundaries of binary search. + """ + + server_max_async_queries: int = 0 + """The maximum number of outstanding queries to allow before earlying out from a + performance run. Useful for performance tuning and speeding up the + FindPeakPerformance mode. + """ + + server_num_issue_query_threads: int = 0 + """The number of issue query threads that will be registered and used + to call SUT's IssueQuery(). If this is 0, the same thread calling + StartTest() will be used to call IssueQuery(). See also + mlperf::RegisterIssueQueryThread(). + """ + + """Offline-specific settings""" + + offline_expected_qps: float = _DEFAULT_OFFLINE_EXPECTED_QPS + """Specifies the QPS the SUT expects to hit for the offline load. + The LoadGen generates 10% more queries than it thinks it needs to meet + the minimum test duration. """ + sample_concatenate_permutation: bool = True + """Affects the order in which the samples of the dataset are chosen. + If False, it concatenates a single permutation of the dataset (or part + of it depending on performance_sample_count_override) several times up to the + number of samples requested. + If True, it concatenates a multiple permutation of the dataset (or a + part of it depending on `performance_sample_count_override`) several times + up to the number of samples requested. + """ + + """Test duration settings""" + min_duration: timedelta = _DEFAULT_MIN_DURATION """The minimum testing duration (in seconds or ISO 8601 format like `PT5S`). The benchmark runs until this value has been met. """ + max_duration: timedelta = timedelta(seconds=0) + """The maximum testing duration (in seconds or ISO 8601 format like `PT5S`). The + benchmark will exit before this value has been met. 0 means infinity. + """ + min_query_count: int = _DEFAULT_DATASET_SIZE """The minimum testing query count. The benchmark runs until this value has been met. If min_query_count is less than the total number of samples in the dataset, only the first min_query_count samples will be used during testing. """ + max_query_count: int = 0 + """The maximum testing query count. The benchmark will exit before this value has + been met. 0 means infinity. 
+ """ + + """Random number generation settings""" + + qsl_rng_seed: int = 0 + """Affects which subset of samples from the QSL are chosen for + the performance sample set and accuracy sample sets.""" + + sample_index_rng_seed: int = 0 + """Affects the order in which samples from the performance set will + be included in queries.""" + + schedule_rng_seed: int = 0 + """Affects the poisson arrival process of the Server scenario. + Different seeds will appear to "jitter" the queries + differently in time, but should not affect the average issued QPS. + """ + + accuracy_log_rng_seed: int = 0 + """Affects which samples have their query returns logged to the + accuracy log in performance mode.""" + + accuracy_log_probability: float = 0.0 + """The probability of the query response of a sample being logged to the + accuracy log in performance mode.""" + + accuracy_log_sampling_target: int = 0 + """The target number of samples that will have their results printed to + accuracy log in performance mode for compliance testing.""" + + """Test05 settings""" + + test05: bool = False + """Whether or not to run test05.""" + + test05_qsl_rng_seed: int = 0 + """Test05 seed for which subset of samples from the QSL are chosen for + the performance sample set and accuracy sample sets.""" + + test05_sample_index_rng_seed: int = 0 + """Test05 seed for the order in which samples from the performance set will + be included in queries.""" + + test05_schedule_rng_seed: int = 0 + """Test05 seed for the poisson arrival process of the Server scenario. + Different seeds will appear to "jitter" the queries + differently in time, but should not affect the average issued QPS. + """ + + """Performance Sample modifiers""" + + print_timestamps: bool = False + """Prints measurement interval start and stop timestamps to stdout + for the purpose of comparison against an external timer.""" + + performance_issue_unique: bool = False + """Allows issuing only unique queries in Performance mode of any + scenario. This can be used to send non-repeat & hence unique + samples to SUT. + """ + + performance_issue_same: bool = False + """If True, the same query is chosen repeatedley for Inference. + In offline scenario, the query is filled with the same sample. + """ + + performance_issue_same_index: int = 0 + """Offset to control which sample is repeated in + performance_issue_same mode. Value should be within [0, performance_sample_count). + """ + performance_sample_count_override: Annotated[ NonNegativeInt, Field( @@ -206,16 +339,34 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): `server_tpot_latency` as the constraint. """ + server_ttft_latency: timedelta = timedelta(milliseconds=100) + """Time to First Token (TTFT) latency constraint result validation (used when + use_token_latencies is enabled). + """ + + server_tpot_latency: timedelta = timedelta(milliseconds=100) + """Time per Output Token (TPOT) latency constraint result validation (used when + use_token_latencies is enabled). + """ + + infer_token_latencies: bool = False + """Infer token latencies from the response time.""" + + token_latency_scaling_factor: int = 1 + """Only used when infer_token_latencies is enabled. The scaling factor inferring + token latencies from the response time. 
+ """ + @field_validator( "server_target_latency", + "min_duration", + "max_duration", "server_ttft_latency", "server_tpot_latency", - "min_duration", mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | - str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -236,24 +387,109 @@ def to_lgtype(self) -> lg.TestSettings: settings = lg.TestSettings() settings.scenario = self.scenario.to_lgtype() settings.mode = self.mode.to_lgtype() - settings.offline_expected_qps = self.offline_expected_qps - settings.server_target_qps = self.server_expected_qps + + # Server-specific settings + settings.server_target_qps = self.server_target_qps settings.server_target_latency_ns = round( self.server_target_latency.total_seconds() * 1e9, ) - settings.ttft_latency = round( - self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round( - self.server_tpot_latency.total_seconds() * 1e9) - settings.min_duration_ms = round( - self.min_duration.total_seconds() * 1000) + settings.server_target_latency_percentile = ( + self.server_target_latency_percentile + ) + settings.server_coalesce_queries = self.server_coalesce_queries + settings.server_find_peak_qps_decimals_of_precision = ( + self.server_find_peak_qps_decimals_of_precision + ) + settings.server_find_peak_qps_boundary_step_size = ( + self.server_find_peak_qps_boundary_step_size + ) + settings.server_max_async_queries = self.server_max_async_queries + settings.server_num_issue_query_threads = self.server_num_issue_query_threads + + # Offline-specific settings + settings.offline_expected_qps = self.offline_expected_qps + settings.sample_concatenate_permutation = self.sample_concatenate_permutation + + # Test duration settings + settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round(self.max_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count + settings.max_query_count = self.max_query_count + + # Random number generation settings + settings.qsl_rng_seed = self.qsl_rng_seed + settings.sample_index_rng_seed = self.sample_index_rng_seed + settings.schedule_rng_seed = self.schedule_rng_seed + settings.accuracy_log_rng_seed = self.accuracy_log_rng_seed + settings.accuracy_log_probability = self.accuracy_log_probability + settings.accuracy_log_sampling_target = self.accuracy_log_sampling_target + + # Test05 settings + settings.test05 = self.test05 + settings.test05_qsl_rng_seed = self.test05_qsl_rng_seed + settings.test05_sample_index_rng_seed = self.test05_sample_index_rng_seed + settings.test05_schedule_rng_seed = self.test05_schedule_rng_seed + + # Performance Sample modifiers + settings.print_timestamps = self.print_timestamps + settings.performance_issue_unique = self.performance_issue_unique + settings.performance_issue_same = self.performance_issue_same + settings.performance_issue_same_index = self.performance_issue_same_index settings.performance_sample_count_override = ( self.performance_sample_count_override ) settings.use_token_latencies = self.use_token_latencies + settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) + settings.infer_token_latencies = self.infer_token_latencies + settings.token_latency_scaling_factor = self.token_latency_scaling_factor + 
return settings + @staticmethod + def from_lgtype(lgtype: lg.TestSettings) -> TestSettings: + """Convert the LoadGen's test settings to the TestSettings schema.""" + return TestSettings( + scenario=TestScenario.from_lgtype(lgtype.scenario), + mode=TestMode.from_lgtype(lgtype.mode), + server_target_qps=lgtype.server_target_qps, + server_target_latency=timedelta( + seconds=lgtype.server_target_latency_ns / 1e9, + ), + server_target_latency_percentile=lgtype.server_target_latency_percentile, + server_coalesce_queries=lgtype.server_coalesce_queries, + server_find_peak_qps_decimals_of_precision=lgtype.server_find_peak_qps_decimals_of_precision, + server_find_peak_qps_boundary_step_size=lgtype.server_find_peak_qps_boundary_step_size, + server_max_async_queries=lgtype.server_max_async_queries, + server_num_issue_query_threads=lgtype.server_num_issue_query_threads, + offline_expected_qps=lgtype.offline_expected_qps, + sample_concatenate_permutation=lgtype.sample_concatenate_permutation, + min_duration=timedelta(milliseconds=lgtype.min_duration_ms), + max_duration=timedelta(milliseconds=lgtype.max_duration_ms), + min_query_count=lgtype.min_query_count, + max_query_count=lgtype.max_query_count, + qsl_rng_seed=lgtype.qsl_rng_seed, + sample_index_rng_seed=lgtype.sample_index_rng_seed, + schedule_rng_seed=lgtype.schedule_rng_seed, + accuracy_log_rng_seed=lgtype.accuracy_log_rng_seed, + accuracy_log_probability=lgtype.accuracy_log_probability, + accuracy_log_sampling_target=lgtype.accuracy_log_sampling_target, + test05=lgtype.test05, + test05_qsl_rng_seed=lgtype.test05_qsl_rng_seed, + test05_sample_index_rng_seed=lgtype.test05_sample_index_rng_seed, + test05_schedule_rng_seed=lgtype.test05_schedule_rng_seed, + print_timestamps=lgtype.print_timestamps, + performance_issue_unique=lgtype.performance_issue_unique, + performance_issue_same=lgtype.performance_issue_same, + performance_issue_same_index=lgtype.performance_issue_same_index, + performance_sample_count_override=lgtype.performance_sample_count_override, + use_token_latencies=lgtype.use_token_latencies, + server_ttft_latency=timedelta(seconds=lgtype.ttft_latency / 1e9), + server_tpot_latency=timedelta(seconds=lgtype.tpot_latency / 1e9), + infer_token_latencies=lgtype.infer_token_latencies, + token_latency_scaling_factor=lgtype.token_latency_scaling_factor, + ) + class LogOutputSettings(BaseModelWithAttributeDescriptionsFromDocstrings): """The test log output settings for the MLPerf inference LoadGen.""" @@ -317,15 +553,50 @@ def to_lgtype(self) -> lg.LogSettings: return log_settings +class UserConf(BaseModelWithAttributeDescriptionsFromDocstrings): + """The user.conf file for specifying LoadGen test settings.""" + + path: FilePath | None = None + """The path to the user.conf file. If provided, the test settings will be overridden + with the settings from the provided user.conf file and the mlperf.conf file from + inside LoadGen. + """ + + model: str = "qwen3-vl-235b-a22b" + """The model name that corresponds to the entries in the mlperf.conf file (in the + LoadGen) which defines the benchmark-wide constraints. 
+ """ + + class Settings(BaseModelWithAttributeDescriptionsFromDocstrings): """Combine the settings for the test and logging of LoadGen.""" test: TestSettings """Test settings parameters.""" + user_conf: UserConf + """The user.conf file for specifying LoadGen test settings.""" + logging: LogSettings """Test logging parameters.""" + @model_validator(mode="after") + def override_test_settings_from_user_conf(self) -> Self: + """Override the test settings from the user.conf file.""" + if self.user_conf.path: + lg_test_settings = self.test.to_lgtype() + lg_test_settings.FromConfig( + str(self.user_conf.path), + self.user_conf.model, + self.test.scenario.value.capitalize(), + ) + self.test = TestSettings.from_lgtype(lg_test_settings) + logger.info( + "Loaded test settings from the user.conf and mlperf.conf files: {}", + self.test, + ) + return self + def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]: """Return test and log settings for LoadGen.""" test_settings = self.test.to_lgtype() @@ -518,6 +789,5 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list( - message["content"]) # type: ignore[arg-type] + message["content"] = list(message["content"]) # type: ignore[arg-type] return messages From 5370ecdd41f2e4c1f4d6fd3b5399fe6eb70eceec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Dec 2025 23:05:07 +0000 Subject: [PATCH 30/39] [Automated Commit] Format Codebase --- .../mlperf_inference_multimodal_vl2l/cli.py | 4 +++- .../mlperf_inference_multimodal_vl2l/schema.py | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py index 59ff457d75..5bf896c668 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py @@ -84,7 +84,9 @@ def _run_benchmark( test_settings, log_settings = settings.to_lgtype() logger.info("Running VL2L benchmark with settings: {}", settings) logger.info("Running VL2L benchmark with dataset: {}", dataset) - logger.info("Running VL2L benchmark with OpenAI API endpoint: {}", endpoint) + logger.info( + "Running VL2L benchmark with OpenAI API endpoint: {}", + endpoint) logger.info("Running VL2L benchmark with random seed: {}", random_seed) task = ShopifyGlobalCatalogue( dataset=dataset, diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py index 8ca2646cf3..4706851843 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py @@ -366,7 +366,8 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | + str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -411,8 +412,10 @@ def to_lgtype(self) -> lg.TestSettings: settings.sample_concatenate_permutation = self.sample_concatenate_permutation # Test duration settings - settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) - settings.max_duration_ms = round(self.max_duration.total_seconds() * 1000) + 
settings.min_duration_ms = round( + self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round( + self.max_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.max_query_count = self.max_query_count @@ -439,8 +442,10 @@ def to_lgtype(self) -> lg.TestSettings: self.performance_sample_count_override ) settings.use_token_latencies = self.use_token_latencies - settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) + settings.ttft_latency = round( + self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round( + self.server_tpot_latency.total_seconds() * 1e9) settings.infer_token_latencies = self.infer_token_latencies settings.token_latency_scaling_factor = self.token_latency_scaling_factor @@ -789,5 +794,6 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list(message["content"]) # type: ignore[arg-type] + message["content"] = list( + message["content"]) # type: ignore[arg-type] return messages From f9d983f8dc68ba4a8ff6ed48fffcf53dec9f9a49 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 18 Dec 2025 22:36:23 -0500 Subject: [PATCH 31/39] readme --- multimodal/vl2l/README.md | 41 +++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md index fcc57c2680..0944546599 100644 --- a/multimodal/vl2l/README.md +++ b/multimodal/vl2l/README.md @@ -109,8 +109,26 @@ Accuracy only mode: mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only ``` +### Pass in `user.conf` + +You can pass in a `user.conf` file through `--settings.user_conf.path`, such that the +LoadGen parameters provided through the CLI will be overridden by the `user.conf` +provided by you and the `mlperf.conf` inside the LoadGen. An example `user.conf` file +is included: [example_user.conf](./example_user.conf). As such, you can run the +benchmark with `user.conf` via: + +```bash +mlperf-inf-mm-vl2l benchmark endpoint \ + --settings.test.scenario \ + --settings.test.mode \ + --settings.user_conf.path example_user.conf +``` + ### Evalute the response quality +You should pass the `mlperf_log_accuracy.json` file (generated by LoadGen) to the +`--filename` flag of the `mlperf-inf-mm-vl2l evaluate` command. + ```bash mlperf-inf-mm-vl2l evaluate --filename output/mlperf_log_accuracy.json ``` @@ -166,7 +184,7 @@ mode with: mlperf-inf-mm-vl2l benchmark vllm \ --settings.test.scenario offline \ --settings.test.mode accuracy_only \ - --dataset.token ... \ + --settings.user_conf.path example_user.conf \ --vllm.cli=--async-scheduling \ --vllm.cli=--max-model-len=32768 \ --vllm.cli=--max-num-seqs=1024 \ @@ -220,12 +238,27 @@ bash submit.sh --help - Dataset: - [Shopify/product-catalogue](https://huggingface.co/datasets/Shopify/product-catalogue) - Commit SHA: [d5c517c509f5aca99053897ef1de797d6d7e5aa5](https://huggingface.co/datasets/Shopify/product-catalogue/tree/d5c517c509f5aca99053897ef1de797d6d7e5aa5) - - Constraint: + - Both the `train` and the `test` splits are used and concatenated in that order. + - Total number of samples: `48289`. + - Guided decoding is not used. + - Constraints: - Model quality: - - Category Hierarchical F1 Score >= `0.7824`. + - Category Hierarchical F1 Score >= `0.7824`. 
This is the 99% recovery of + `0.7903037` which is the mean category hierarchical F1 score across 10 runs. The + standard deviation across those 10 runs is `0.0002250412555`. - Server Scenario: + - Target latency is used as the constraint, instead of Time to First Token (TTFT) + or Time per Output Token (TPOT) latencies. - Target latency percentile = `0.99`. - - Target latency <= 12 seconds. + - Target latency $\le$ 12 seconds. + - Offline Scenario: + - Number of samples in the query $\ge$ `48289` (i.e., every sample in the entire + dataset would be send to the VLM endpoint at least once). + - Performance sample count: `48289` (i.e., the entire dataset will be loaded into + the host memory, which takes ~6.39 GB). + - Testing duration $\ge$ 10 mins. + - Sample concatenation permutation is enabled. + ## Developer Guide From f8e6bf84f3cee841a5c4f58738a45200c259df6f Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Thu, 18 Dec 2025 22:43:23 -0500 Subject: [PATCH 32/39] readme --- multimodal/vl2l/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md index 0944546599..9cd847ff21 100644 --- a/multimodal/vl2l/README.md +++ b/multimodal/vl2l/README.md @@ -244,8 +244,9 @@ bash submit.sh --help - Constraints: - Model quality: - Category Hierarchical F1 Score >= `0.7824`. This is the 99% recovery of - `0.7903037` which is the mean category hierarchical F1 score across 10 runs. The - standard deviation across those 10 runs is `0.0002250412555`. + `0.7903037` which is the mean category hierarchical F1 score across 10 runs on + [the BF16 version of the model](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct). + The standard deviation across those 10 runs is `0.0002250412555`. - Server Scenario: - Target latency is used as the constraint, instead of Time to First Token (TTFT) or Time per Output Token (TPOT) latencies. 
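The `user.conf` override that PATCH 29 wires into `Settings.override_test_settings_from_user_conf` can be exercised directly against the LoadGen Python bindings. The sketch below is illustrative only and is not part of any patch in this series; it assumes `mlperf_loadgen` is installed, that the `example_user.conf` from PATCH 29 sits in the current working directory, and that the `qwen3-vl-235b-a22b` entries are present in the `mlperf.conf` bundled inside LoadGen.

```python
# Minimal sketch of the override flow: build a TestSettings with CLI-style
# defaults, then let FromConfig() overlay the bundled mlperf.conf plus the
# user-provided user.conf on top of it (mirroring the model_validator added
# to schema.py in PATCH 29).
import mlperf_loadgen as lg

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
settings.mode = lg.TestMode.PerformanceOnly

# FromConfig(path, model, scenario, conf_type=1) as declared in test_settings.h;
# the model name must match the benchmark entries in mlperf.conf.
settings.FromConfig("example_user.conf", "qwen3-vl-235b-a22b", "Offline")

# After the call, these fields should reflect *.Offline.target_qps,
# *.Offline.min_duration and *.Offline.min_query_count from example_user.conf.
print(settings.offline_expected_qps)
print(settings.min_duration_ms)
print(settings.min_query_count)
```

`TestSettings.from_lgtype()`, also added in PATCH 29, then converts the overridden `lg.TestSettings` back into the pydantic model so the effective values can be logged before the run starts.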
From 8bfbeb98ef7c26bcddf9e776059efdb31174fa6d Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Fri, 19 Dec 2025 03:59:43 -0500 Subject: [PATCH 33/39] rename vl2l -> q3vl --- multimodal/{vl2l => qwen3-vl}/.gitignore | 0 multimodal/{vl2l => qwen3-vl}/README.md | 66 +++++++++---------- .../docker/vllm-cuda.Dockerfile | 62 ++++++++--------- .../{vl2l => qwen3-vl}/example_user.conf | 0 .../notebooks/shopify-global-catalogue.ipynb | 8 +-- multimodal/{vl2l => qwen3-vl}/pyproject.toml | 10 +-- .../{vl2l => qwen3-vl}/scripts/linters.sh | 0 .../scripts/slurm/benchmark.sh | 2 +- .../scripts/slurm/evaluate.sh | 2 +- .../scripts/slurm/submit.sh | 2 +- .../src/mlperf_inf_mm_q3vl}/__init__.py | 4 +- .../src/mlperf_inf_mm_q3vl}/cli.py | 23 +++---- .../src/mlperf_inf_mm_q3vl}/deploy.py | 0 .../src/mlperf_inf_mm_q3vl}/evaluation.py | 30 +++------ .../src/mlperf_inf_mm_q3vl}/log.py | 5 +- .../src/mlperf_inf_mm_q3vl}/py.typed | 0 .../src/mlperf_inf_mm_q3vl}/schema.py | 30 ++++----- .../src/mlperf_inf_mm_q3vl}/task.py | 15 ++--- 18 files changed, 119 insertions(+), 140 deletions(-) rename multimodal/{vl2l => qwen3-vl}/.gitignore (100%) rename multimodal/{vl2l => qwen3-vl}/README.md (83%) rename multimodal/{vl2l => qwen3-vl}/docker/vllm-cuda.Dockerfile (66%) rename multimodal/{vl2l => qwen3-vl}/example_user.conf (100%) rename multimodal/{vl2l => qwen3-vl}/notebooks/shopify-global-catalogue.ipynb (99%) rename multimodal/{vl2l => qwen3-vl}/pyproject.toml (82%) rename multimodal/{vl2l => qwen3-vl}/scripts/linters.sh (100%) rename multimodal/{vl2l => qwen3-vl}/scripts/slurm/benchmark.sh (96%) rename multimodal/{vl2l => qwen3-vl}/scripts/slurm/evaluate.sh (94%) rename multimodal/{vl2l => qwen3-vl}/scripts/slurm/submit.sh (99%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/__init__.py (52%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/cli.py (83%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/deploy.py (100%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/evaluation.py (92%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/log.py (88%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/py.typed (100%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/schema.py (96%) rename multimodal/{vl2l/src/mlperf_inference_multimodal_vl2l => qwen3-vl/src/mlperf_inf_mm_q3vl}/task.py (98%) diff --git a/multimodal/vl2l/.gitignore b/multimodal/qwen3-vl/.gitignore similarity index 100% rename from multimodal/vl2l/.gitignore rename to multimodal/qwen3-vl/.gitignore diff --git a/multimodal/vl2l/README.md b/multimodal/qwen3-vl/README.md similarity index 83% rename from multimodal/vl2l/README.md rename to multimodal/qwen3-vl/README.md index 9cd847ff21..f99b0d0f15 100644 --- a/multimodal/vl2l/README.md +++ b/multimodal/qwen3-vl/README.md @@ -1,4 +1,4 @@ -# Reference Implementation for the Vision-language-to-language (VL2L) Benchmark +# Reference Implementation for the Qwen3-VL (Q3VL) Benchmark ## Quick Start @@ -11,17 +11,17 @@ on how to install Miniconda on your host machine. 
Then, you can create a new con environment via: ```bash -conda create -n mlperf-inf-mm-vl2l python=3.12 +conda create -n mlperf-inf-mm-q3vl python=3.12 ``` -### Install the VL2L benchmarking CLI +### Install the Q3VL benchmarking CLI #### For users -Install `mlperf-inf-mm-vl2l` with: +Install `mlperf-inf-mm-q3vl` with: ```bash -pip install git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l/ +pip install git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/ ``` #### For developers @@ -38,29 +38,29 @@ Then enter the repo: cd mlperf-inference/ ``` -Install `mlperf-inf-mm-vl2l` and the development tools with: +Install `mlperf-inf-mm-q3vl` and the development tools with: - On Bash ```bash -pip install -e multimodal/vl2l/[dev] +pip install -e multimodal/qwen3-vl/[dev] ``` - On Zsh ```zsh -pip install -e multimodal/vl2l/"[dev]" +pip install -e multimodal/qwen3-vl/"[dev]" ``` -### Post VL2L benchmarking CLI installation +### Post Q3VL benchmarking CLI installation -After installation, you can check the CLI flags that `mlperf-inf-mm-vl2l` can take with: +After installation, you can check the CLI flags that `mlperf-inf-mm-q3vl` can take with: ```bash -mlperf-inf-mm-vl2l --help +mlperf-inf-mm-q3vl --help ``` -You can enable shell autocompletion for `mlperf-inf-mm-vl2l` with: +You can enable shell autocompletion for `mlperf-inf-mm-q3vl` with: ```bash -mlperf-inf-mm-vl2l --install-completion +mlperf-inf-mm-q3vl --install-completion ``` > [!NOTE] @@ -86,13 +86,13 @@ docker run --gpus all \ # Use all the GPUs on th Performance only mode: ```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario offline --settings.test.mode performance_only +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode performance_only ``` Accuracy only mode: ```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario offline --settings.test.mode accuracy_only +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode accuracy_only ``` ### Run the benchmark for the Server scenario @@ -100,13 +100,13 @@ mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario offline --setting Performance only mode: ```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario server --settings.test.mode performance_only +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode performance_only ``` Accuracy only mode: ```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only ``` ### Pass in `user.conf` @@ -118,7 +118,7 @@ is included: [example_user.conf](./example_user.conf). As such, you can run the benchmark with `user.conf` via: ```bash -mlperf-inf-mm-vl2l benchmark endpoint \ +mlperf-inf-mm-q3vl benchmark endpoint \ --settings.test.scenario \ --settings.test.mode \ --settings.user_conf.path example_user.conf @@ -127,17 +127,17 @@ mlperf-inf-mm-vl2l benchmark endpoint \ ### Evalute the response quality You should pass the `mlperf_log_accuracy.json` file (generated by LoadGen) to the -`--filename` flag of the `mlperf-inf-mm-vl2l evaluate` command. +`--filename` flag of the `mlperf-inf-mm-q3vl evaluate` command. 
```bash -mlperf-inf-mm-vl2l evaluate --filename output/mlperf_log_accuracy.json +mlperf-inf-mm-q3vl evaluate --filename output/mlperf_log_accuracy.json ``` ## Docker -[docker/](docker/) provides examples of Dockerfiles that install the VL2L benchmarking +[docker/](docker/) provides examples of Dockerfiles that install the Q3VL benchmarking CLI into the container images of the inference engine. This is useful when you have to -run both the inference engine and the VL2L benchmarking CLI inside the same container, +run both the inference engine and the Q3VL benchmarking CLI inside the same container, for example, in a situation where you must use a GPU cluster managed by [Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and [pyxis](https://github.com/NVIDIA/pyxis). @@ -151,27 +151,27 @@ Inference repo: ```bash docker build \ --build-arg BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 \ - --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=multimodal/vl2l \ - -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ - -t mlperf-inf-mm-vl2l:vllm-openai-v0.12.0 \ + --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=multimodal/qwen3-vl \ + -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ + -t mlperf-inf-mm-q3vl:vllm-openai-v0.12.0 \ . ``` > [!NOTE] -> `MLPERF_INF_MM_VL2L_INSTALL_URL` can also take in a remote GitHub location, such as -> `git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l/`. +> `MLPERF_INF_MM_Q3VL_INSTALL_URL` can also take in a remote GitHub location, such as +> `git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/`. 2. Afterwards, you can start the container in the interactive mode by ```bash -docker run --rm -it --gpus all -v ~/.cache:/root/.cache --ipc=host mlperf-inf-mm-vl2l:vllm-openai-v0.12.0 +docker run --rm -it --gpus all -v ~/.cache:/root/.cache --ipc=host mlperf-inf-mm-q3vl:vllm-openai-v0.12.0 ``` ### Benchmark against vLLM inside the container -If you are running `mlperf-inf-mm-vl2l` inside a local environment that has access to +If you are running `mlperf-inf-mm-q3vl` inside a local environment that has access to vLLM (such as inside a container that was created using the [docker/vllm-cuda.Dockerfile](docker/vllm-cuda.Dockerfile)), you can use a single -`mlperf-inf-mm-vl2l benchmark vllm` command to achieve: +`mlperf-inf-mm-q3vl benchmark vllm` command to achieve: 1. Deploy an endpoint using vLLM. 2. Wait for the endpoint to be healthy. @@ -181,7 +181,7 @@ For example, inside the container, you can run the Offline scenario Accuracy onl mode with: ```bash -mlperf-inf-mm-vl2l benchmark vllm \ +mlperf-inf-mm-q3vl benchmark vllm \ --settings.test.scenario offline \ --settings.test.mode accuracy_only \ --settings.user_conf.path example_user.conf \ @@ -265,8 +265,8 @@ bash submit.sh --help ### Linting -You can lint the VL2L benchmark source code by running the following script: +You can lint the Q3VL benchmark source code by running the following script: ```bash -bash multimodal/vl2l/scripts/linters.sh +bash multimodal/qwen3-vl/scripts/linters.sh ``` \ No newline at end of file diff --git a/multimodal/vl2l/docker/vllm-cuda.Dockerfile b/multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile similarity index 66% rename from multimodal/vl2l/docker/vllm-cuda.Dockerfile rename to multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile index 756cb05a3a..a54bda1364 100644 --- a/multimodal/vl2l/docker/vllm-cuda.Dockerfile +++ b/multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile @@ -9,33 +9,33 @@ # docker build -t myimage . # # 2. 
Install from a different git URL or branch: -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=git+https://github.com/USER/REPO.git@BRANCH#subdirectory=multimodal/vl2l \ +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=git+https://github.com/USER/REPO.git@BRANCH#subdirectory=multimodal/qwen3-vl \ # -t myimage . # # 3. Install from local directory (build from repo root with git auto-detection): # (Version number will be auto-detected from git if the build context includes .git) -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=multimodal/vl2l \ -# -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=multimodal/qwen3-vl \ +# -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ # -t myimage . # -# 4. Install from local directory (build from multimodal/vl2l subdirectory): +# 4. Install from local directory (build from multimodal/qwen3-vl subdirectory): # (No .git in subdirectory, will use fallback version "0.0.0.dev0") -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ -# -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ -# -t myimage multimodal/vl2l +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ +# -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ +# -t myimage multimodal/qwen3-vl # -# 5. Install from local directory when pwd is already multimodal/vl2l: +# 5. Install from local directory when pwd is already multimodal/qwen3-vl: # (No .git in subdirectory, will use fallback version "0.0.0.dev0") -# cd multimodal/vl2l -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ +# cd multimodal/qwen3-vl +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ # -f docker/vllm-cuda.Dockerfile \ # -t myimage . # # 6. Install from local directory with a custom fallback version: # (Override the default "0.0.0.dev0" version when git is not available) -# cd multimodal/vl2l -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ -# --build-arg MLPERF_INF_MM_VL2L_VERSION=1.0.0 \ +# cd multimodal/qwen3-vl +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ +# --build-arg MLPERF_INF_MM_Q3VL_VERSION=1.0.0 \ # -f docker/vllm-cuda.Dockerfile \ # -t myimage . # @@ -48,26 +48,26 @@ ARG BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 FROM ${BASE_IMAGE_URL} -# MLPERF_INF_MM_VL2L_INSTALL_URL can be either: +# MLPERF_INF_MM_Q3VL_INSTALL_URL can be either: # 1. A git URL (default): git+https://github.com/... -# 2. A local directory path relative to the build context (e.g., multimodal/vl2l) +# 2. 
A local directory path relative to the build context (e.g., multimodal/qwen3-vl) # Note: The build context is the directory you pass to `docker build` (the final arg) -# MLPERF_INF_MM_VL2L_INSTALL_URL must be a valid path inside that build context -ARG MLPERF_INF_MM_VL2L_INSTALL_URL=git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l +# MLPERF_INF_MM_Q3VL_INSTALL_URL must be a valid path inside that build context +ARG MLPERF_INF_MM_Q3VL_INSTALL_URL=git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl # Temporary directory inside the container where the build context will be copied # Only used when installing from a local directory path -ARG BUILD_CONTEXT_DIR=/tmp/mm_vl2l_build_context +ARG BUILD_CONTEXT_DIR=/tmp/mm_q3vl_build_context # Fallback version to use when building from local directory without git metadata # setuptools-scm will first try to detect version from .git, and use this as fallback # Must be a valid PEP 440 version string (e.g., "0.0.0.dev0", "1.0.0", "0.1.0.dev1") # Can be overridden at build time with --build-arg -ARG MLPERF_INF_MM_VL2L_VERSION=0.0.0.dev0 +ARG MLPERF_INF_MM_Q3VL_VERSION=0.0.0.dev0 # Install # - git (required for installing "git+..." dependencies to work) -# - tmux (for `vllm serve` and `mlperf-inf-mm-vl2l` in different tmux sessions) +# - tmux (for `vllm serve` and `mlperf-inf-mm-q3vl` in different tmux sessions) # - vim (for editing files in the container) RUN apt-get update && \ apt-get install -y git tmux vim && \ @@ -79,25 +79,25 @@ RUN apt-get update && \ #ENV LD_LIBRARY_PATH=/usr/local/lib/python3.12/dist-packages/torch/lib:$LD_LIBRARY_PATH # Copy build context. -# This will be used only if MLPERF_INF_MM_VL2L_INSTALL_URL is a local path. +# This will be used only if MLPERF_INF_MM_Q3VL_INSTALL_URL is a local path. COPY . ${BUILD_CONTEXT_DIR}/ -# Install the mlperf-inference-multimodal-vl2l package. +# Install the mlperf-inference-multimodal-q3vl package. # We use --system to install into the container's global python environment. 
-# Detect if MLPERF_INF_MM_VL2L_INSTALL_URL is a git URL or a local path: -RUN if echo "${MLPERF_INF_MM_VL2L_INSTALL_URL}" | grep -q "^git+"; then \ - echo "Installing from git URL: ${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ - uv pip install --system --no-cache --verbose "${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ +# Detect if MLPERF_INF_MM_Q3VL_INSTALL_URL is a git URL or a local path: +RUN if echo "${MLPERF_INF_MM_Q3VL_INSTALL_URL}" | grep -q "^git+"; then \ + echo "Installing from git URL: ${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ + uv pip install --system --no-cache --verbose "${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ else \ - echo "Installing from local path: ${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ + echo "Installing from local path: ${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ # Check if the package directory is inside a git repository \ - if cd "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_VL2L_INSTALL_URL}" && git rev-parse --git-dir > /dev/null 2>&1; then \ + if cd "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_Q3VL_INSTALL_URL}" && git rev-parse --git-dir > /dev/null 2>&1; then \ echo "Git repository detected, setuptools-scm will detect version automatically"; \ else \ - echo "Not in a git repository, using fallback version: ${MLPERF_INF_MM_VL2L_VERSION}"; \ - export SETUPTOOLS_SCM_PRETEND_VERSION_FOR_MLPERF_INFERENCE_MULTIMODAL_VL2L="${MLPERF_INF_MM_VL2L_VERSION}"; \ + echo "Not in a git repository, using fallback version: ${MLPERF_INF_MM_Q3VL_VERSION}"; \ + export SETUPTOOLS_SCM_PRETEND_VERSION_FOR_MLPERF_INF_MM_Q3VL="${MLPERF_INF_MM_Q3VL_VERSION}"; \ fi; \ - uv pip install --system --no-cache --verbose "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ + uv pip install --system --no-cache --verbose "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ fi && \ rm -rf "${BUILD_CONTEXT_DIR}" diff --git a/multimodal/vl2l/example_user.conf b/multimodal/qwen3-vl/example_user.conf similarity index 100% rename from multimodal/vl2l/example_user.conf rename to multimodal/qwen3-vl/example_user.conf diff --git a/multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb b/multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb similarity index 99% rename from multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb rename to multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb index d973c74014..682398ac37 100644 --- a/multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb +++ b/multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "f194bfdf-c9f1-4738-bdb5-258dd4bc05f0", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ "from io import BytesIO\n", "import base64\n", "import pprint\n", - "from mlperf_inference_multimodal_vl2l.task import Task\n", + "from mlperf_inf_mm_q3vl.task import Task\n", "from openai import AsyncOpenAI, DefaultAioHttpClient\n", "import numpy as np\n", "import json\n", @@ -451,12 +451,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "b6aa7372", "metadata": {}, "outputs": [], "source": [ - "from mlperf_inference_multimodal_vl2l.task import ProductMetadata\n", + "from mlperf_inf_mm_q3vl.task import ProductMetadata\n", "\n", "def build_messages(sample):\n", " image_file = BytesIO()\n", diff --git a/multimodal/vl2l/pyproject.toml b/multimodal/qwen3-vl/pyproject.toml similarity index 82% rename from multimodal/vl2l/pyproject.toml rename to multimodal/qwen3-vl/pyproject.toml index 1d1d90ec75..255b3f4e16 100644 --- 
a/multimodal/vl2l/pyproject.toml +++ b/multimodal/qwen3-vl/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "mlperf-inference-multimodal-vl2l" -description = "The reference implementation for the vision-language-to-language (VL2L) benchmark in MLPerf Inference" +name = "mlperf-inf-mm-q3vl" +description = "The reference implementation for the Qwen3-VL (Q3VL) benchmark in MLPerf Inference" readme = "README.md" classifiers = [ "Programming Language :: Python :: 3", @@ -30,10 +30,10 @@ dynamic = ["version"] dev = ["black", "ruff", "mypy", "shellcheck-py", "pytest"] [project.scripts] -mlperf-inf-mm-vl2l = "mlperf_inference_multimodal_vl2l.cli:app" +mlperf-inf-mm-q3vl = "mlperf_inf_mm_q3vl.cli:app" [project.urls] -Homepage = "https://github.com/mlcommons/inference/multimodal/vl2l" +Homepage = "https://github.com/mlcommons/inference/multimodal/qwen3-vl" [build-system] requires = ["setuptools>=80", "setuptools-scm[simple]>=8"] @@ -43,7 +43,7 @@ build-backend = "setuptools.build_meta" where = ["src"] [tool.setuptools.package-data] -"mlperf_inference_multimodal_vl2l" = ["py.typed"] +"mlperf_inf_mm_q3vl" = ["py.typed"] [tool.setuptools_scm] root = "../../" diff --git a/multimodal/vl2l/scripts/linters.sh b/multimodal/qwen3-vl/scripts/linters.sh similarity index 100% rename from multimodal/vl2l/scripts/linters.sh rename to multimodal/qwen3-vl/scripts/linters.sh diff --git a/multimodal/vl2l/scripts/slurm/benchmark.sh b/multimodal/qwen3-vl/scripts/slurm/benchmark.sh similarity index 96% rename from multimodal/vl2l/scripts/slurm/benchmark.sh rename to multimodal/qwen3-vl/scripts/slurm/benchmark.sh index 3c2a118b07..00167cd3b3 100644 --- a/multimodal/vl2l/scripts/slurm/benchmark.sh +++ b/multimodal/qwen3-vl/scripts/slurm/benchmark.sh @@ -17,7 +17,7 @@ srun \ --container-image="${CONTAINER_IMAGE}" \ --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ --no-container-mount-home \ - mlperf-inf-mm-vl2l benchmark vllm \ + mlperf-inf-mm-q3vl benchmark vllm \ --settings.test.scenario="${SCENARIO}" \ --settings.test.mode="${MODE}" \ --settings.test.server_expected_qps="${SERVER_EXPECTED_QPS}" \ diff --git a/multimodal/vl2l/scripts/slurm/evaluate.sh b/multimodal/qwen3-vl/scripts/slurm/evaluate.sh similarity index 94% rename from multimodal/vl2l/scripts/slurm/evaluate.sh rename to multimodal/qwen3-vl/scripts/slurm/evaluate.sh index 4ae554b61d..54615f2e33 100644 --- a/multimodal/vl2l/scripts/slurm/evaluate.sh +++ b/multimodal/qwen3-vl/scripts/slurm/evaluate.sh @@ -17,5 +17,5 @@ srun \ --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ --no-container-mount-home \ --container-env=NVIDIA_VISIBLE_DEVICES \ - mlperf-inf-mm-vl2l evaluate \ + mlperf-inf-mm-q3vl evaluate \ --filename="${OUTPUT_CONTAINER_DIR}"/"${BENCHMARK_JOB_ID}"/mlperf_log_accuracy.json \ No newline at end of file diff --git a/multimodal/vl2l/scripts/slurm/submit.sh b/multimodal/qwen3-vl/scripts/slurm/submit.sh similarity index 99% rename from multimodal/vl2l/scripts/slurm/submit.sh rename to multimodal/qwen3-vl/scripts/slurm/submit.sh index 34af7bd89f..8e07336d7f 100644 --- a/multimodal/vl2l/scripts/slurm/submit.sh +++ b/multimodal/qwen3-vl/scripts/slurm/submit.sh @@ -38,7 +38,7 @@ evaluate_slurm_partition=${DEFAULT_EVALUATE_SLURM_PARTITION} function _exit_with_help_msg() { cat < None: - """Run the VL2L benchmark.""" - test_settings, log_settings = settings.to_lgtype() - logger.info("Running VL2L benchmark with settings: {}", settings) 
- logger.info("Running VL2L benchmark with dataset: {}", dataset) + """Run the Qwen3-VL (Q3VL) benchmark.""" + logger.info("Running Qwen3-VL (Q3VL) benchmark with settings: {}", settings) + logger.info("Running Qwen3-VL (Q3VL) benchmark with dataset: {}", dataset) logger.info( - "Running VL2L benchmark with OpenAI API endpoint: {}", - endpoint) - logger.info("Running VL2L benchmark with random seed: {}", random_seed) + "Running Qwen3-VL (Q3VL) benchmark with OpenAI API endpoint: {}", + endpoint, + ) + logger.info("Running Qwen3-VL (Q3VL) benchmark with random seed: {}", random_seed) + test_settings, log_settings = settings.to_lgtype() task = ShopifyGlobalCatalogue( dataset=dataset, endpoint=endpoint, @@ -96,9 +97,9 @@ def _run_benchmark( ) sut = task.construct_sut() qsl = task.construct_qsl() - logger.info("Starting the VL2L benchmark with LoadGen...") + logger.info("Starting the Qwen3-VL (Q3VL) benchmark with LoadGen...") lg.StartTestWithLogSettings(sut, qsl, test_settings, log_settings) - logger.info("The VL2L benchmark with LoadGen completed.") + logger.info("The Qwen3-VL (Q3VL) benchmark with LoadGen completed.") lg.DestroyQSL(qsl) lg.DestroySUT(sut) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py similarity index 100% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py similarity index 92% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py index 3f99787008..35ecced1fc 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py @@ -1,4 +1,4 @@ -"""Task definitions for the VL2L benchmark.""" +"""Task definitions for the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations @@ -62,8 +62,7 @@ def get_hierarchical_components( intersection_count = 0 # Iterate through the paths simultaneously - for pred_cat, true_cat in zip( - predicted_categories, true_categories, strict=False): + for pred_cat, true_cat in zip(predicted_categories, true_categories, strict=False): if pred_cat == true_cat: intersection_count += 1 else: @@ -229,8 +228,7 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: ), ), brand=_PRED_BRAND_PAD, - is_secondhand=local_rng.choice( - [True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], ) error_messages.append( ( @@ -253,16 +251,14 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: ) # random category selection # Uniform distribution is the default - rand_cat = local_rng.choice( - ground_truth_item["potential_product_categories"]) + rand_cat = local_rng.choice(ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( - (rand_is_secondhand, - ground_truth_item["ground_truth_is_secondhand"]), + (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), ) return { @@ -277,8 +273,7 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: } -def run_evaluation(random_seed: int, 
filename: FilePath, - dataset: DatasetCLI) -> None: +def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> None: """Main function to run the evaluation.""" master_rng = np.random.default_rng(seed=random_seed) with Path.open(filename) as f: @@ -295,7 +290,7 @@ def run_evaluation(random_seed: int, filename: FilePath, chunk_size = max(len(model_output) // cpu_count, 1) # Create chunks output_chunks = [ - model_output[i: i + chunk_size] + model_output[i : i + chunk_size] for i in range(0, len(model_output), chunk_size) ] @@ -331,8 +326,7 @@ def run_evaluation(random_seed: int, filename: FilePath, category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) category_rand_pred_src.extend(chunk["category_rand_pred_src"]) is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) - is_secondhand_rand_pred_src.extend( - chunk["is_secondhand_rand_pred_src"]) + is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) brand_pred_src.extend(chunk["brand_pred_src"]) all_possible_brands.extend(chunk["all_possible_brands"]) @@ -345,8 +339,7 @@ def run_evaluation(random_seed: int, filename: FilePath, rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_is_seconhand_f1_score = calculate_secondhand_f1( - is_secondhand_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1(is_secondhand_rand_pred_src) all_brands_list = list(set(all_possible_brands)) random_brand_predictions = master_rng.choice( @@ -361,10 +354,7 @@ def run_evaluation(random_seed: int, filename: FilePath, with ProcessPoolExecutor() as executor: rand_brand_data = list( - executor.map( - _process_chunk_rnd_brand, - args_list, - chunksize=chunk_size), + executor.map(_process_chunk_rnd_brand, args_list, chunksize=chunk_size), ) rand_brand_score = calculate_brand_f1_score( diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py similarity index 88% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py index a24eb514f0..b47a699f07 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py @@ -1,4 +1,4 @@ -"""Logging utilities for the VL2L benchmark.""" +"""Logging utilities for the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations @@ -32,8 +32,7 @@ def get_log_file_path(key: str, settings: Settings) -> Path: ) -def setup_loguru_for_benchmark( - settings: Settings, verbosity: Verbosity) -> None: +def setup_loguru_for_benchmark(settings: Settings, verbosity: Verbosity) -> None: """Setup the loguru logger for running the benchmark.""" logger.remove() logger.add(sys.stdout, level=verbosity.value.upper()) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/py.typed b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/py.typed similarity index 100% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/py.typed rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/py.typed diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py similarity index 96% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py index 4706851843..bb2fa7c00d 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py @@ 
-1,4 +1,4 @@ -"""Schema definitions of various data structures in the VL2L benchmark.""" +"""Schema definitions of various data structures in the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations @@ -366,8 +366,7 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | - str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -412,10 +411,8 @@ def to_lgtype(self) -> lg.TestSettings: settings.sample_concatenate_permutation = self.sample_concatenate_permutation # Test duration settings - settings.min_duration_ms = round( - self.min_duration.total_seconds() * 1000) - settings.max_duration_ms = round( - self.max_duration.total_seconds() * 1000) + settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round(self.max_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.max_query_count = self.max_query_count @@ -442,10 +439,8 @@ def to_lgtype(self) -> lg.TestSettings: self.performance_sample_count_override ) settings.use_token_latencies = self.use_token_latencies - settings.ttft_latency = round( - self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round( - self.server_tpot_latency.total_seconds() * 1e9) + settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) settings.infer_token_latencies = self.infer_token_latencies settings.token_latency_scaling_factor = self.token_latency_scaling_factor @@ -610,7 +605,7 @@ def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]: class Model(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies the model to use for the VL2L benchmark.""" + """Specifies the model to use for the Qwen3-VL (Q3VL) benchmark.""" repo_id: str = "Qwen/Qwen3-VL-235B-A22B-Instruct" """The HuggingFace repository ID of the model.""" @@ -656,7 +651,7 @@ class Verbosity(StrEnum): class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies the OpenAI API endpoint to use for the VL2L benchmark.""" + """Specifies the OpenAI API endpoint to use for the Qwen3-VL (Q3VL) benchmark.""" url: str = "http://localhost:8000/v1" """The URL of the OpenAI API endpoint that the inference requests are sent to.""" @@ -665,8 +660,8 @@ class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings): """The API key to authenticate the inference requests.""" model: Model - """The model to use for the VL2L benchmark, i.e., the model that was deployed behind - this OpenAI API endpoint. + """The model to use for the Qwen3-VL (Q3VL) benchmark, i.e., the model that was + deployed behind this OpenAI API endpoint. 
""" use_guided_decoding: bool = False @@ -686,7 +681,7 @@ class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings): class EndpointToDeploy(Endpoint): - """Specifies the endpoint to deploy for the VL2L benchmark.""" + """Specifies the endpoint to deploy for the Qwen3-VL (Q3VL) benchmark.""" startup_timeout: timedelta = timedelta(hours=1) """The timeout for the endpoint to start up.""" @@ -794,6 +789,5 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list( - message["content"]) # type: ignore[arg-type] + message["content"] = list(message["content"]) # type: ignore[arg-type] return messages diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py similarity index 98% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py index 6fab0b6409..8df7d543bf 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py @@ -1,4 +1,4 @@ -"""Task definitions for the VL2L benchmark.""" +"""Task definitions for the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations @@ -67,8 +67,7 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout( - timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -188,9 +187,7 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min( - MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, - self.total_num_samples), + k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -277,8 +274,7 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -355,8 +351,7 @@ async def _query_endpoint_async_batch( ], ) - async def _query_endpoint_async_stream( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: From b589ddd6b69a51dac26f23488448c9d6f8d619bb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 19 Dec 2025 09:00:17 +0000 Subject: [PATCH 34/39] [Automated Commit] Format Codebase --- .../qwen3-vl/src/mlperf_inf_mm_q3vl/cli.py | 8 ++++-- .../src/mlperf_inf_mm_q3vl/evaluation.py | 28 +++++++++++++------ .../qwen3-vl/src/mlperf_inf_mm_q3vl/log.py | 3 +- .../qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py | 18 ++++++++---- .../qwen3-vl/src/mlperf_inf_mm_q3vl/task.py | 13 ++++++--- 5 files changed, 48 insertions(+), 22 deletions(-) diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/cli.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/cli.py index 9800681558..23fb1f7a72 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/cli.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/cli.py @@ -81,13 +81,17 @@ def 
_run_benchmark( random_seed: int, ) -> None: """Run the Qwen3-VL (Q3VL) benchmark.""" - logger.info("Running Qwen3-VL (Q3VL) benchmark with settings: {}", settings) + logger.info( + "Running Qwen3-VL (Q3VL) benchmark with settings: {}", + settings) logger.info("Running Qwen3-VL (Q3VL) benchmark with dataset: {}", dataset) logger.info( "Running Qwen3-VL (Q3VL) benchmark with OpenAI API endpoint: {}", endpoint, ) - logger.info("Running Qwen3-VL (Q3VL) benchmark with random seed: {}", random_seed) + logger.info( + "Running Qwen3-VL (Q3VL) benchmark with random seed: {}", + random_seed) test_settings, log_settings = settings.to_lgtype() task = ShopifyGlobalCatalogue( dataset=dataset, diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py index 35ecced1fc..7bf59ce302 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py @@ -62,7 +62,8 @@ def get_hierarchical_components( intersection_count = 0 # Iterate through the paths simultaneously - for pred_cat, true_cat in zip(predicted_categories, true_categories, strict=False): + for pred_cat, true_cat in zip( + predicted_categories, true_categories, strict=False): if pred_cat == true_cat: intersection_count += 1 else: @@ -228,7 +229,8 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: ), ), brand=_PRED_BRAND_PAD, - is_secondhand=local_rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice( + [True, False], size=1).tolist()[0], ) error_messages.append( ( @@ -251,14 +253,16 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: ) # random category selection # Uniform distribution is the default - rand_cat = local_rng.choice(ground_truth_item["potential_product_categories"]) + rand_cat = local_rng.choice( + ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( - (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), + (rand_is_secondhand, + ground_truth_item["ground_truth_is_secondhand"]), ) return { @@ -273,7 +277,8 @@ def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: } -def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> None: +def run_evaluation(random_seed: int, filename: FilePath, + dataset: DatasetCLI) -> None: """Main function to run the evaluation.""" master_rng = np.random.default_rng(seed=random_seed) with Path.open(filename) as f: @@ -290,7 +295,7 @@ def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> chunk_size = max(len(model_output) // cpu_count, 1) # Create chunks output_chunks = [ - model_output[i : i + chunk_size] + model_output[i: i + chunk_size] for i in range(0, len(model_output), chunk_size) ] @@ -326,7 +331,8 @@ def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) category_rand_pred_src.extend(chunk["category_rand_pred_src"]) is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) - is_secondhand_rand_pred_src.extend(chunk["is_secondhand_rand_pred_src"]) + is_secondhand_rand_pred_src.extend( + chunk["is_secondhand_rand_pred_src"]) brand_pred_src.extend(chunk["brand_pred_src"]) 
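Earlier in this hunk, `get_hierarchical_components` walks the predicted and ground-truth category paths in lockstep and accumulates an intersection count over the levels that agree. The benchmark's `calculate_hierarchical_f1` itself is not shown in this patch series, so the following is only a sketch of how a hierarchical F1 can be derived from such prefix counts; the `>` path separator and the exact precision/recall definitions are assumptions, not the reference implementation.

```python
def matching_prefix_length(predicted: list[str], truth: list[str]) -> int:
    """Count leading category levels that agree, stopping at the first mismatch."""
    count = 0
    for pred_cat, true_cat in zip(predicted, truth):
        if pred_cat != true_cat:
            break
        count += 1
    return count


def hierarchical_f1(pairs: list[tuple[str, str]]) -> float:
    """Micro-averaged hierarchical F1 over (predicted_path, true_path) pairs."""
    intersection = predicted_total = truth_total = 0
    for pred_path, true_path in pairs:
        pred_levels = [level.strip() for level in pred_path.split(">")]
        true_levels = [level.strip() for level in true_path.split(">")]
        intersection += matching_prefix_length(pred_levels, true_levels)
        predicted_total += len(pred_levels)
        truth_total += len(true_levels)
    if intersection == 0:
        return 0.0
    precision = intersection / predicted_total
    recall = intersection / truth_total
    return 2 * precision * recall / (precision + recall)


if __name__ == "__main__":
    sample = [("Apparel > Shoes > Sneakers", "Apparel > Shoes > Boots")]
    print(hierarchical_f1(sample))  # precision = recall = 2/3, so F1 = 0.666...
```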
all_possible_brands.extend(chunk["all_possible_brands"]) @@ -339,7 +345,8 @@ def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_is_seconhand_f1_score = calculate_secondhand_f1(is_secondhand_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1( + is_secondhand_rand_pred_src) all_brands_list = list(set(all_possible_brands)) random_brand_predictions = master_rng.choice( @@ -354,7 +361,10 @@ def run_evaluation(random_seed: int, filename: FilePath, dataset: DatasetCLI) -> with ProcessPoolExecutor() as executor: rand_brand_data = list( - executor.map(_process_chunk_rnd_brand, args_list, chunksize=chunk_size), + executor.map( + _process_chunk_rnd_brand, + args_list, + chunksize=chunk_size), ) rand_brand_score = calculate_brand_f1_score( diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py index b47a699f07..56700ef7d8 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py @@ -32,7 +32,8 @@ def get_log_file_path(key: str, settings: Settings) -> Path: ) -def setup_loguru_for_benchmark(settings: Settings, verbosity: Verbosity) -> None: +def setup_loguru_for_benchmark( + settings: Settings, verbosity: Verbosity) -> None: """Setup the loguru logger for running the benchmark.""" logger.remove() logger.add(sys.stdout, level=verbosity.value.upper()) diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py index bb2fa7c00d..8f16ba9715 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py @@ -366,7 +366,8 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | + str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -411,8 +412,10 @@ def to_lgtype(self) -> lg.TestSettings: settings.sample_concatenate_permutation = self.sample_concatenate_permutation # Test duration settings - settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) - settings.max_duration_ms = round(self.max_duration.total_seconds() * 1000) + settings.min_duration_ms = round( + self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round( + self.max_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.max_query_count = self.max_query_count @@ -439,8 +442,10 @@ def to_lgtype(self) -> lg.TestSettings: self.performance_sample_count_override ) settings.use_token_latencies = self.use_token_latencies - settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) + settings.ttft_latency = round( + self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round( + self.server_tpot_latency.total_seconds() * 1e9) settings.infer_token_latencies = self.infer_token_latencies settings.token_latency_scaling_factor = self.token_latency_scaling_factor @@ -789,5 +794,6 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = 
list(message["content"]) # type: ignore[arg-type] + message["content"] = list( + message["content"]) # type: ignore[arg-type] return messages diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py index 8df7d543bf..f05147acc6 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py @@ -67,7 +67,8 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout( + timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -187,7 +188,9 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), + k=min( + MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, + self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -274,7 +277,8 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -351,7 +355,8 @@ async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> Non ], ) - async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: From 3b065ee9440097f0297fb61ab632db35caf32406 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Fri, 19 Dec 2025 04:00:48 -0500 Subject: [PATCH 35/39] empty From eb6559060aab729e6a763ea4931234cb7f0219fb Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Fri, 19 Dec 2025 04:10:24 -0500 Subject: [PATCH 36/39] rerun ci From 38ff6f9a2734d570429975d02772ea5f956a060c Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Fri, 19 Dec 2025 12:14:50 -0500 Subject: [PATCH 37/39] rerun ci From c1534aeb133631201f87b2eb83f1ca7dedac31d5 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Fri, 19 Dec 2025 20:33:00 -0500 Subject: [PATCH 38/39] Introduce sampling parameters --- multimodal/qwen3-vl/README.md | 8 ++ .../qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py | 86 ++++++++++++++++--- .../qwen3-vl/src/mlperf_inf_mm_q3vl/task.py | 35 ++++++-- 3 files changed, 108 insertions(+), 21 deletions(-) diff --git a/multimodal/qwen3-vl/README.md b/multimodal/qwen3-vl/README.md index f99b0d0f15..37274b2f45 100644 --- a/multimodal/qwen3-vl/README.md +++ b/multimodal/qwen3-vl/README.md @@ -241,6 +241,14 @@ bash submit.sh --help - Both the `train` and the `test` splits are used and concatenated in that order. - Total number of samples: `48289`. - Guided decoding is not used. + - Sampling parameters: + - Frequency penalty: `0.0` + - Presence penalty: `0.0` + - Temperature: `1.0` + - Top-P: `1.0` + - Top-K: `0` + - Min-P: `0.0` + - Repetition penalty: `1.0` - Constraints: - Model quality: - Category Hierarchical F1 Score >= `0.7824`. 
This is the 99% recovery of diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py index 8f16ba9715..5a2eb88610 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py @@ -366,8 +366,7 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | - str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -412,10 +411,8 @@ def to_lgtype(self) -> lg.TestSettings: settings.sample_concatenate_permutation = self.sample_concatenate_permutation # Test duration settings - settings.min_duration_ms = round( - self.min_duration.total_seconds() * 1000) - settings.max_duration_ms = round( - self.max_duration.total_seconds() * 1000) + settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round(self.max_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.max_query_count = self.max_query_count @@ -442,10 +439,8 @@ def to_lgtype(self) -> lg.TestSettings: self.performance_sample_count_override ) settings.use_token_latencies = self.use_token_latencies - settings.ttft_latency = round( - self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round( - self.server_tpot_latency.total_seconds() * 1e9) + settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) settings.infer_token_latencies = self.infer_token_latencies settings.token_latency_scaling_factor = self.token_latency_scaling_factor @@ -655,6 +650,71 @@ class Verbosity(StrEnum): """The info verbosity level (default).""" +class SamplingParams(BaseModelWithAttributeDescriptionsFromDocstrings): + """Specifies the sampling parameters for the inference request to the endpoint.""" + + frequency_penalty: float = 0.0 + """Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to repeat + the same line verbatim. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-frequency_penalty + """ + + presence_penalty: float = 0.0 + """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether + they appear in the text so far, increasing the model's likelihood to talk about new + topics. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-presence_penalty + """ + + temperature: float = 1.0 + """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more focused + and deterministic. We generally recommend altering this or top_p but not both. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-temperature + """ + + top_p: float = 1.0 + """An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 means + only the tokens comprising the top 10% probability mass are considered. We generally + recommend altering this or temperature but not both. 
+ See https://platform.openai.com/docs/api-reference/chat/create#chat_create-top_p + """ + + top_k: int = 0 + """Controls the number of top tokens to consider. Set to 0 (or -1) to + consider all tokens. + Note that this is not part of the OpenAI API spec. Therefore, this field will be + passed in via the `extra_body` field of the inference request to the endpoint. + The inference engine therefore needs to support this field, such as what vLLM does + here: + https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L566 + """ + + min_p: float = 0.0 + """Represents the minimum probability for a token to be considered, + relative to the probability of the most likely token. Must be in [0, 1]. + Set to 0 to disable this. + Note that this is not part of the OpenAI API spec. Therefore, this field will be + passed in via the `extra_body` field of the inference request to the endpoint. + The inference engine therefore needs to support this field, such as what vLLM does + here: + https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L567 + """ + + repetition_penalty: float = 1.0 + """Penalizes new tokens based on whether they appear in the prompt and the + generated text so far. Values > 1 encourage the model to use new tokens, + while values < 1 encourage the model to repeat tokens. + Note that this is not part of the OpenAI API spec. Therefore, this field will be + passed in via the `extra_body` field of the inference request to the endpoint. + The inference engine therefore needs to support this field, such as what vLLM does + here: + https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L568 + """ + + class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings): """Specifies the OpenAI API endpoint to use for the Qwen3-VL (Q3VL) benchmark.""" @@ -684,6 +744,9 @@ class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings): not be sufficient for the offline scenario. 
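As the docstrings above spell out, `frequency_penalty`, `presence_penalty`, `temperature`, and `top_p` are standard OpenAI chat-completions parameters, while `top_k`, `min_p`, and `repetition_penalty` are vLLM extensions that must travel in `extra_body`. A minimal sketch of that split (the later `task.py` hunks in this series do the same thing inline); `client`, `params`, and `messages` are assumed inputs:

```python
from openai import AsyncOpenAI


async def send_request(client: AsyncOpenAI, params, messages) -> str:
    """Sketch: forward SamplingParams values to an OpenAI-compatible endpoint."""
    response = await client.chat.completions.create(
        model="Qwen/Qwen3-VL-235B-A22B-Instruct",
        messages=messages,
        # Standard OpenAI sampling knobs are ordinary keyword arguments.
        frequency_penalty=params.frequency_penalty,
        presence_penalty=params.presence_penalty,
        temperature=params.temperature,
        top_p=params.top_p,
        # The vLLM-only knobs are not in the OpenAI spec, so the client forwards
        # them verbatim through extra_body.
        extra_body={
            "top_k": params.top_k,
            "min_p": params.min_p,
            "repetition_penalty": params.repetition_penalty,
        },
    )
    return response.choices[0].message.content or ""
```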
""" + sampling_params: SamplingParams + """The sampling parameters to use for the inference request to the endpoint.""" + class EndpointToDeploy(Endpoint): """Specifies the endpoint to deploy for the Qwen3-VL (Q3VL) benchmark.""" @@ -794,6 +857,5 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list( - message["content"]) # type: ignore[arg-type] + message["content"] = list(message["content"]) # type: ignore[arg-type] return messages diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py index f05147acc6..af76c77dc7 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py @@ -67,8 +67,7 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout( - timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -188,9 +187,7 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min( - MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, - self.total_num_samples), + k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -277,8 +274,7 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -306,6 +302,17 @@ async def _query_endpoint_async_batch( if sample.response_format is not None else None ), + frequency_penalty=self.endpoint.sampling_params.frequency_penalty, + presence_penalty=self.endpoint.sampling_params.presence_penalty, + temperature=self.endpoint.sampling_params.temperature, + top_p=self.endpoint.sampling_params.top_p, + extra_body={ + "top_k": self.endpoint.sampling_params.top_k, + "min_p": self.endpoint.sampling_params.min_p, + "repetition_penalty": ( + self.endpoint.sampling_params.repetition_penalty + ), + }, ) logger.debug( "Received response (ID: {}) from endpoint after {} seconds.", @@ -355,8 +362,7 @@ async def _query_endpoint_async_batch( ], ) - async def _query_endpoint_async_stream( - self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: @@ -387,6 +393,17 @@ async def _query_endpoint_async_stream( if sample.response_format is not None else None ), + frequency_penalty=self.endpoint.sampling_params.frequency_penalty, + presence_penalty=self.endpoint.sampling_params.presence_penalty, + temperature=self.endpoint.sampling_params.temperature, + top_p=self.endpoint.sampling_params.top_p, + extra_body={ + "top_k": self.endpoint.sampling_params.top_k, + "min_p": self.endpoint.sampling_params.min_p, + "repetition_penalty": ( + self.endpoint.sampling_params.repetition_penalty + ), + }, ) # iterate asynchronously total_tokens = 0 From 472471f89d5b36a498bd2b791f70dd4685e3650b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 
20 Dec 2025 01:33:31 +0000 Subject: [PATCH 39/39] [Automated Commit] Format Codebase --- .../qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py | 20 ++++++++++++------- .../qwen3-vl/src/mlperf_inf_mm_q3vl/task.py | 13 ++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py index 5a2eb88610..9462aeedd3 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py @@ -366,7 +366,8 @@ class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): mode="before", ) @classmethod - def parse_timedelta(cls, value: timedelta | float | str) -> timedelta | str: + def parse_timedelta(cls, value: timedelta | float | + str) -> timedelta | str: """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" if isinstance(value, timedelta): return value @@ -411,8 +412,10 @@ def to_lgtype(self) -> lg.TestSettings: settings.sample_concatenate_permutation = self.sample_concatenate_permutation # Test duration settings - settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000) - settings.max_duration_ms = round(self.max_duration.total_seconds() * 1000) + settings.min_duration_ms = round( + self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round( + self.max_duration.total_seconds() * 1000) settings.min_query_count = self.min_query_count settings.max_query_count = self.max_query_count @@ -439,8 +442,10 @@ def to_lgtype(self) -> lg.TestSettings: self.performance_sample_count_override ) settings.use_token_latencies = self.use_token_latencies - settings.ttft_latency = round(self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round(self.server_tpot_latency.total_seconds() * 1e9) + settings.ttft_latency = round( + self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round( + self.server_tpot_latency.total_seconds() * 1e9) settings.infer_token_latencies = self.infer_token_latencies settings.token_latency_scaling_factor = self.token_latency_scaling_factor @@ -671,7 +676,7 @@ class SamplingParams(BaseModelWithAttributeDescriptionsFromDocstrings): """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or top_p but not both. 
See - https://platform.openai.com/docs/api-reference/chat/create#chat_create-temperature + https://platform.openai.com/docs/api-reference/chat/create#chat_create-temperature """ top_p: float = 1.0 @@ -857,5 +862,6 @@ def ensure_content_is_list( == "pydantic_core._pydantic_core" and message["content"].__class__.__name__ == "ValidatorIterator" ): - message["content"] = list(message["content"]) # type: ignore[arg-type] + message["content"] = list( + message["content"]) # type: ignore[arg-type] return messages diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py index af76c77dc7..ddcd962ea1 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py @@ -67,7 +67,8 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout( + timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -187,7 +188,9 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), + k=min( + MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, + self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -274,7 +277,8 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -362,7 +366,8 @@ async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> Non ], ) - async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try:
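For reference, the `AsyncOpenAI` construction that these `task.py` hunks keep re-wrapping reduces to the pattern below: an aiohttp-backed HTTP client with the request timeout applied at the transport level (plus a 5-second connect timeout) and again at the OpenAI-client level. This is a minimal sketch with placeholder values, assuming `DefaultAioHttpClient` comes from the openai package's aiohttp support; the benchmark derives the URL, key, and timeout from its `Endpoint` settings.

```python
import httpx
from openai import AsyncOpenAI, DefaultAioHttpClient

# Assumed example values; the benchmark reads these from Endpoint settings.
endpoint_url = "http://localhost:8000/v1"
api_key = "dummy-key"
request_timeout_seconds = 600.0

client = AsyncOpenAI(
    base_url=endpoint_url,
    api_key=api_key,
    # aiohttp-backed transport with a total per-request timeout and a short
    # connect timeout, mirroring DefaultAioHttpClient(...) in the patch.
    http_client=DefaultAioHttpClient(
        timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0),
    ),
    # The same budget is also passed as the client's own request timeout.
    timeout=request_timeout_seconds,
)
```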