diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 96396dab92..4e72f542ed 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -312,6 +312,8 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::server_num_issue_query_threads) .def_readwrite("offline_expected_qps", &TestSettings::offline_expected_qps) + .def_readwrite("sample_concatenate_permutation", + &TestSettings::sample_concatenate_permutation) .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms) .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms) .def_readwrite("min_query_count", &TestSettings::min_query_count) @@ -324,6 +326,14 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::accuracy_log_rng_seed) .def_readwrite("accuracy_log_probability", &TestSettings::accuracy_log_probability) + .def_readwrite("accuracy_log_sampling_target", + &TestSettings::accuracy_log_sampling_target) + .def_readwrite("test05", &TestSettings::test05) + .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) + .def_readwrite("test05_sample_index_rng_seed", + &TestSettings::test05_sample_index_rng_seed) + .def_readwrite("test05_schedule_rng_seed", + &TestSettings::test05_schedule_rng_seed) .def_readwrite("print_timestamps", &TestSettings::print_timestamps) .def_readwrite("performance_issue_unique", &TestSettings::performance_issue_unique) @@ -333,12 +343,6 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::performance_issue_same_index) .def_readwrite("performance_sample_count_override", &TestSettings::performance_sample_count_override) - .def_readwrite("test05", &TestSettings::test05) - .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) - .def_readwrite("test05_sample_index_rng_seed", - &TestSettings::test05_sample_index_rng_seed) - .def_readwrite("test05_schedule_rng_seed", - &TestSettings::test05_schedule_rng_seed) .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1b825514bd..d21a73a47d 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -26,6 +26,7 @@ rgat.*.performance_sample_count_override = 788379 pointpainting.*.performance_sample_count_override = 1024 deepseek-r1.*.performance_sample_count_override = 4388 whisper.*.performance_sample_count_override = 1633 +qwen3-vl-235b-a22b.*.performance_sample_count_override = 48289 # set to 0 to let entire sample set to be performance sample 3d-unet.*.performance_sample_count_override = 0 @@ -67,7 +68,7 @@ llama3_1-8b-edge.*.sample_concatenate_permutation = 1 llama3_1-8b-interactive.*.sample_concatenate_permutation = 1 deepseek-r1.*.sample_concatenate_permutation = 1 whisper.*.sample_concatenate_permutation = 1 - +qwen3-vl-235b-a22b.*.sample_concatenate_permutation = 1 *.Server.target_latency = 10 *.Server.target_latency_percentile = 99 *.Server.target_duration = 0 @@ -91,7 +92,9 @@ llama3_1-8b-edge.*.use_token_latencies = 1 llama3_1-8b-interactive.*.use_token_latencies = 1 deepseek-r1.*.use_token_latencies = 1 whisper.*.use_token_latencies = 1 - +# For the VLM benchmark, the model response is relatively short, therefore we track +# end-to-end latency instead of token latencies. 
+qwen3-vl-235b-a22b.*.use_token_latencies = 0 # gptj benchmark infers token latencies gptj.*.infer_token_latencies = 1 gptj.*.token_latency_scaling_factor = 69 @@ -132,6 +135,8 @@ deepseek-r1.Server.target_latency = 0 deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 +qwen3-vl-235b-a22b.Server.target_latency = 12000 + *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 @@ -156,6 +161,7 @@ mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 deepseek-r1.Offline.min_query_count = 4388 whisper.Offline.min_query_count = 1633 +qwen3-vl-235b-a22b.Offline.min_query_count = 48289 # These fields should be defined and overridden by user.conf. *.SingleStream.target_latency = 10 diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 584d073bb8..2e092e721d 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -234,10 +234,6 @@ struct TestSettings { uint64_t test05_qsl_rng_seed = 0; uint64_t test05_sample_index_rng_seed = 0; uint64_t test05_schedule_rng_seed = 0; - - /// \brief Load mlperf parameter config from file. - int FromConfig(const std::string &path, const std::string &model, - const std::string &scenario, int conf_type = 1); /**@}*/ // ================================== @@ -272,6 +268,10 @@ struct TestSettings { bool infer_token_latencies = false; uint64_t token_latency_scaling_factor; /**@}*/ + + /// \brief Load mlperf parameter config from file. + int FromConfig(const std::string &path, const std::string &model, + const std::string &scenario, int conf_type = 1); }; /// diff --git a/multimodal/vl2l/.gitignore b/multimodal/qwen3-vl/.gitignore similarity index 100% rename from multimodal/vl2l/.gitignore rename to multimodal/qwen3-vl/.gitignore diff --git a/multimodal/qwen3-vl/README.md b/multimodal/qwen3-vl/README.md new file mode 100644 index 0000000000..37274b2f45 --- /dev/null +++ b/multimodal/qwen3-vl/README.md @@ -0,0 +1,280 @@ +# Reference Implementation for the Qwen3-VL (Q3VL) Benchmark + +## Quick Start + +This guide demonstrates how you can run the benchmark on your local machine. + +### Create a Conda environment + +Follow [this link](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions) +on how to install Miniconda on your host machine. Then, you can create a new conda +environment via: + +```bash +conda create -n mlperf-inf-mm-q3vl python=3.12 +``` + +### Install the Q3VL benchmarking CLI + +#### For users + +Install `mlperf-inf-mm-q3vl` with: + +```bash +pip install git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/ +``` + +#### For developers + +Clone the MLPerf Inference repo via: + +```bash +git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf-inference +``` + +Then enter the repo: + +```bash +cd mlperf-inference/ +``` + +Install `mlperf-inf-mm-q3vl` and the development tools with: + +- On Bash +```bash +pip install -e multimodal/qwen3-vl/[dev] +``` +- On Zsh +```zsh +pip install -e multimodal/qwen3-vl/"[dev]" +``` + +### Post Q3VL benchmarking CLI installation + +After installation, you can check the CLI flags that `mlperf-inf-mm-q3vl` can take with: + +```bash +mlperf-inf-mm-q3vl --help +``` + +You can enable shell autocompletion for `mlperf-inf-mm-q3vl` with: + +```bash +mlperf-inf-mm-q3vl --install-completion +``` + +> [!NOTE] +> Shell auto-completion will take effect once you restart the terminal. 
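+
+As a quick, optional sanity check (an illustrative sketch only, assuming `mlperf_loadgen`
+was pulled in as a dependency of the CLI install), you can confirm from Python that the
+LoadGen fields newly exposed in `loadgen/bindings/python_api.cc`, such as
+`sample_concatenate_permutation` and `accuracy_log_sampling_target`, are available:
+
+```python
+import mlperf_loadgen as lg
+
+settings = lg.TestSettings()
+settings.scenario = lg.TestScenario.Offline
+settings.mode = lg.TestMode.PerformanceOnly
+# Fields exposed to Python by this change:
+settings.sample_concatenate_permutation = True
+settings.accuracy_log_sampling_target = 0
+settings.test05 = False
+print("mlperf_loadgen TestSettings fields are accessible")
+```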
+ +### Start an inference endpoint on your local host machine with vLLM + +Please refer to [this guide on how to launch vLLM for various Qwen3 VL MoE models](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3-VL.html). + +```bash +docker run --gpus all \ # Use all the GPUs on this host machine. + -v ~/.cache/huggingface:/root/.cache/huggingface \ # Use the HuggingFace cache from your host machine. + -p 8000:8000 \ # This assumes the endpoint will use port 8000. + --ipc=host \ # The container can access and utilize the host's IPC mechanisms (e.g., shared memory). + vllm/vllm-openai:nightly \ # You can also use the `:latest` container or a specific release. + --model Qwen/Qwen3-VL-235B-A22B-Instruct \ # Specifies the model for vLLM to deploy. + --tensor-parallel-size 8 \ # 8-way tensor-parallel inference across 8 GPUs. + --limit-mm-per-prompt.video 0 # The input requests will contain images only (i.e., no videos). +``` + +### Run the benchmark for the Offline scenario + +Performance only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode performance_only +``` + +Accuracy only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario offline --settings.test.mode accuracy_only +``` + +### Run the benchmark for the Server scenario + +Performance only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode performance_only +``` + +Accuracy only mode: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only +``` + +### Pass in `user.conf` + +You can pass in a `user.conf` file through `--settings.user_conf.path`, such that the +LoadGen parameters provided through the CLI will be overridden by the `user.conf` +provided by you and the `mlperf.conf` inside the LoadGen. An example `user.conf` file +is included: [example_user.conf](./example_user.conf). As such, you can run the +benchmark with `user.conf` via: + +```bash +mlperf-inf-mm-q3vl benchmark endpoint \ + --settings.test.scenario <offline|server> \ + --settings.test.mode <performance_only|accuracy_only> \ + --settings.user_conf.path example_user.conf +``` + +### Evaluate the response quality + +You should pass the `mlperf_log_accuracy.json` file (generated by LoadGen) to the +`--filename` flag of the `mlperf-inf-mm-q3vl evaluate` command. + +```bash +mlperf-inf-mm-q3vl evaluate --filename output/mlperf_log_accuracy.json +``` + +## Docker + +[docker/](docker/) provides examples of Dockerfiles that install the Q3VL benchmarking +CLI into the container images of the inference engine. This is useful when you have to +run both the inference engine and the Q3VL benchmarking CLI inside the same container, +for example, in a situation where you must use a GPU cluster managed by +[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and +[pyxis](https://github.com/NVIDIA/pyxis). + +As an illustrative example, assuming that you are at the root directory of the MLPerf +Inference repo: + +1. You can build a container image against vLLM's +`vllm/vllm-openai:v0.12.0` release by + +```bash +docker build \ + --build-arg BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 \ + --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=multimodal/qwen3-vl \ + -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ + -t mlperf-inf-mm-q3vl:vllm-openai-v0.12.0 \ + .
+``` +> [!NOTE] +> `MLPERF_INF_MM_Q3VL_INSTALL_URL` can also take in a remote GitHub location, such as +> `git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/`. + +2. Afterwards, you can start the container in the interactive mode by + +```bash +docker run --rm -it --gpus all -v ~/.cache:/root/.cache --ipc=host mlperf-inf-mm-q3vl:vllm-openai-v0.12.0 +``` + +### Benchmark against vLLM inside the container + +If you are running `mlperf-inf-mm-q3vl` inside a local environment that has access to +vLLM (such as inside a container that was created using the +[docker/vllm-cuda.Dockerfile](docker/vllm-cuda.Dockerfile)), you can use a single +`mlperf-inf-mm-q3vl benchmark vllm` command to achieve: + +1. Deploy an endpoint using vLLM. +2. Wait for the endpoint to be healthy. +3. Run the benchmark against that endpoint. + +For example, inside the container, you can run the Offline scenario Accuracy only +mode with: + +```bash +mlperf-inf-mm-q3vl benchmark vllm \ + --settings.test.scenario offline \ + --settings.test.mode accuracy_only \ + --settings.user_conf.path example_user.conf \ + --vllm.cli=--async-scheduling \ + --vllm.cli=--max-model-len=32768 \ + --vllm.cli=--max-num-seqs=1024 \ + --vllm.cli=--compilation-config='{ + "cudagraph_capture_sizes": [ + 1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, + 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, + 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, + 496, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768 + ] + }' \ + --vllm.cli=--limit-mm-per-prompt.video=0 \ + --vllm.cli=--tensor-parallel-size=8 +``` + +## Slurm + +[scripts/slurm/](scripts/slurm/) provide example scripts of running both the benchmark +and the response quality evaluation in a GPU cluster managed by +[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and +[pyxis](https://github.com/NVIDIA/pyxis). Specifically, + +- [scripts/slurm/benchmark.sh](scripts/slurm/benchmark.sh) is a sbatch script that + runs the benchmarking job. +- [scripts/slurm/evaluate.sh](scripts/slurm/evaluate.sh) is a sbatch script that runs + the evaluation job. +- [scripts/slurm/submit.sh](scripts/slurm/submit.sh) is a Bash script that submits both + jobs, where the evaluation job would only run if the benchmarking job has succeeded. + +You can check the CLI flags that [scripts/slurm/submit.sh](scripts/slurm/submit.sh) can +take via: + +```bash +bash submit.sh --help +``` + +> [!NOTE] +> Slurm clusters are often highly customized per organization. If you are unfamiliar +> with Slurm, you should check with the cluster administrator of your organization +> first, get a good understanding of what those example scripts do, and adapt the +> example scripts to the specific settings for the Slurm cluster that you are going +> to use, before you try to launch any jobs. 
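+
+The job dependency that [scripts/slurm/submit.sh](scripts/slurm/submit.sh) sets up can be
+summarized by the sketch below. It is illustrative only (it assumes the standard
+`sbatch --parsable` and `--dependency=afterok:<job_id>` flags and the script names above);
+the actual submission logic lives in `submit.sh`:
+
+```python
+import subprocess
+
+# Submit the benchmarking job; --parsable makes sbatch print only the job ID.
+benchmark_job_id = subprocess.run(
+    ["sbatch", "--parsable", "benchmark.sh"],
+    check=True, capture_output=True, text=True,
+).stdout.strip()
+
+# Submit the evaluation job so that it starts only if the benchmark job succeeds.
+subprocess.run(
+    ["sbatch", f"--dependency=afterok:{benchmark_job_id}", "evaluate.sh"],
+    check=True,
+)
+```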
+ +## Reference Implementation Specification + +- v6.0 Round + - vLLM version: [v0.12.0](https://github.com/vllm-project/vllm/releases/tag/v0.12.0) + - Model: + - [Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct) + - Commit SHA: [710c13861be6c466e66de3f484069440b8f31389](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct/tree/710c13861be6c466e66de3f484069440b8f31389) + - Dataset: + - [Shopify/product-catalogue](https://huggingface.co/datasets/Shopify/product-catalogue) + - Commit SHA: [d5c517c509f5aca99053897ef1de797d6d7e5aa5](https://huggingface.co/datasets/Shopify/product-catalogue/tree/d5c517c509f5aca99053897ef1de797d6d7e5aa5) + - Both the `train` and the `test` splits are used and concatenated in that order. + - Total number of samples: `48289`. + - Guided decoding is not used. + - Sampling parameters: + - Frequency penalty: `0.0` + - Presence penalty: `0.0` + - Temperature: `1.0` + - Top-P: `1.0` + - Top-K: `0` + - Min-P: `0.0` + - Repetition penalty: `1.0` + - Constraints: + - Model quality: + - Category Hierarchical F1 Score >= `0.7824`. This is the 99% recovery of + `0.7903037` which is the mean category hierarchical F1 score across 10 runs on + [the BF16 version of the model](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct). + The standard deviation across those 10 runs is `0.0002250412555`. + - Server Scenario: + - Target latency is used as the constraint, instead of Time to First Token (TTFT) + or Time per Output Token (TPOT) latencies. + - Target latency percentile = `0.99`. + - Target latency $\le$ 12 seconds. + - Offline Scenario: + - Number of samples in the query $\ge$ `48289` (i.e., every sample in the entire + dataset would be send to the VLM endpoint at least once). + - Performance sample count: `48289` (i.e., the entire dataset will be loaded into + the host memory, which takes ~6.39 GB). + - Testing duration $\ge$ 10 mins. + - Sample concatenation permutation is enabled. + + +## Developer Guide + +### Linting + +You can lint the Q3VL benchmark source code by running the following script: + +```bash +bash multimodal/qwen3-vl/scripts/linters.sh +``` \ No newline at end of file diff --git a/multimodal/vl2l/docker/vllm-cuda.Dockerfile b/multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile similarity index 65% rename from multimodal/vl2l/docker/vllm-cuda.Dockerfile rename to multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile index 0c7597ce76..a54bda1364 100644 --- a/multimodal/vl2l/docker/vllm-cuda.Dockerfile +++ b/multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile @@ -9,33 +9,33 @@ # docker build -t myimage . # # 2. Install from a different git URL or branch: -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=git+https://github.com/USER/REPO.git@BRANCH#subdirectory=multimodal/vl2l \ +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=git+https://github.com/USER/REPO.git@BRANCH#subdirectory=multimodal/qwen3-vl \ # -t myimage . # # 3. Install from local directory (build from repo root with git auto-detection): # (Version number will be auto-detected from git if the build context includes .git) -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=multimodal/vl2l \ -# -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=multimodal/qwen3-vl \ +# -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ # -t myimage . # -# 4. Install from local directory (build from multimodal/vl2l subdirectory): +# 4. 
Install from local directory (build from multimodal/qwen3-vl subdirectory): # (No .git in subdirectory, will use fallback version "0.0.0.dev0") -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ -# -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ -# -t myimage multimodal/vl2l +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ +# -f multimodal/qwen3-vl/docker/vllm-cuda.Dockerfile \ +# -t myimage multimodal/qwen3-vl # -# 5. Install from local directory when pwd is already multimodal/vl2l: +# 5. Install from local directory when pwd is already multimodal/qwen3-vl: # (No .git in subdirectory, will use fallback version "0.0.0.dev0") -# cd multimodal/vl2l -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ +# cd multimodal/qwen3-vl +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ # -f docker/vllm-cuda.Dockerfile \ # -t myimage . # # 6. Install from local directory with a custom fallback version: # (Override the default "0.0.0.dev0" version when git is not available) -# cd multimodal/vl2l -# docker build --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=. \ -# --build-arg MLPERF_INF_MM_VL2L_VERSION=1.0.0 \ +# cd multimodal/qwen3-vl +# docker build --build-arg MLPERF_INF_MM_Q3VL_INSTALL_URL=. \ +# --build-arg MLPERF_INF_MM_Q3VL_VERSION=1.0.0 \ # -f docker/vllm-cuda.Dockerfile \ # -t myimage . # @@ -45,29 +45,29 @@ # # ============================================================================ -ARG BASE_IMAGE_URL=vllm/vllm-openai:nightly +ARG BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 FROM ${BASE_IMAGE_URL} -# MLPERF_INF_MM_VL2L_INSTALL_URL can be either: +# MLPERF_INF_MM_Q3VL_INSTALL_URL can be either: # 1. A git URL (default): git+https://github.com/... -# 2. A local directory path relative to the build context (e.g., multimodal/vl2l) +# 2. A local directory path relative to the build context (e.g., multimodal/qwen3-vl) # Note: The build context is the directory you pass to `docker build` (the final arg) -# MLPERF_INF_MM_VL2L_INSTALL_URL must be a valid path inside that build context -ARG MLPERF_INF_MM_VL2L_INSTALL_URL=git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l +# MLPERF_INF_MM_Q3VL_INSTALL_URL must be a valid path inside that build context +ARG MLPERF_INF_MM_Q3VL_INSTALL_URL=git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl # Temporary directory inside the container where the build context will be copied # Only used when installing from a local directory path -ARG BUILD_CONTEXT_DIR=/tmp/mm_vl2l_build_context +ARG BUILD_CONTEXT_DIR=/tmp/mm_q3vl_build_context # Fallback version to use when building from local directory without git metadata # setuptools-scm will first try to detect version from .git, and use this as fallback # Must be a valid PEP 440 version string (e.g., "0.0.0.dev0", "1.0.0", "0.1.0.dev1") # Can be overridden at build time with --build-arg -ARG MLPERF_INF_MM_VL2L_VERSION=0.0.0.dev0 +ARG MLPERF_INF_MM_Q3VL_VERSION=0.0.0.dev0 # Install # - git (required for installing "git+..." dependencies to work) -# - tmux (for `vllm serve` and `mlperf-inf-mm-vl2l` in different tmux sessions) +# - tmux (for `vllm serve` and `mlperf-inf-mm-q3vl` in different tmux sessions) # - vim (for editing files in the container) RUN apt-get update && \ apt-get install -y git tmux vim && \ @@ -79,25 +79,25 @@ RUN apt-get update && \ #ENV LD_LIBRARY_PATH=/usr/local/lib/python3.12/dist-packages/torch/lib:$LD_LIBRARY_PATH # Copy build context. 
-# This will be used only if MLPERF_INF_MM_VL2L_INSTALL_URL is a local path. +# This will be used only if MLPERF_INF_MM_Q3VL_INSTALL_URL is a local path. COPY . ${BUILD_CONTEXT_DIR}/ -# Install the mlperf-inference-multimodal-vl2l package. +# Install the mlperf-inference-multimodal-q3vl package. # We use --system to install into the container's global python environment. -# Detect if MLPERF_INF_MM_VL2L_INSTALL_URL is a git URL or a local path: -RUN if echo "${MLPERF_INF_MM_VL2L_INSTALL_URL}" | grep -q "^git+"; then \ - echo "Installing from git URL: ${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ - uv pip install --system --no-cache --verbose "${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ +# Detect if MLPERF_INF_MM_Q3VL_INSTALL_URL is a git URL or a local path: +RUN if echo "${MLPERF_INF_MM_Q3VL_INSTALL_URL}" | grep -q "^git+"; then \ + echo "Installing from git URL: ${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ + uv pip install --system --no-cache --verbose "${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ else \ - echo "Installing from local path: ${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ + echo "Installing from local path: ${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ # Check if the package directory is inside a git repository \ - if cd "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_VL2L_INSTALL_URL}" && git rev-parse --git-dir > /dev/null 2>&1; then \ + if cd "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_Q3VL_INSTALL_URL}" && git rev-parse --git-dir > /dev/null 2>&1; then \ echo "Git repository detected, setuptools-scm will detect version automatically"; \ else \ - echo "Not in a git repository, using fallback version: ${MLPERF_INF_MM_VL2L_VERSION}"; \ - export SETUPTOOLS_SCM_PRETEND_VERSION_FOR_MLPERF_INFERENCE_MULTIMODAL_VL2L="${MLPERF_INF_MM_VL2L_VERSION}"; \ + echo "Not in a git repository, using fallback version: ${MLPERF_INF_MM_Q3VL_VERSION}"; \ + export SETUPTOOLS_SCM_PRETEND_VERSION_FOR_MLPERF_INF_MM_Q3VL="${MLPERF_INF_MM_Q3VL_VERSION}"; \ fi; \ - uv pip install --system --no-cache --verbose "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_VL2L_INSTALL_URL}"; \ + uv pip install --system --no-cache --verbose "${BUILD_CONTEXT_DIR}/${MLPERF_INF_MM_Q3VL_INSTALL_URL}"; \ fi && \ rm -rf "${BUILD_CONTEXT_DIR}" diff --git a/multimodal/qwen3-vl/example_user.conf b/multimodal/qwen3-vl/example_user.conf new file mode 100644 index 0000000000..615c92fe67 --- /dev/null +++ b/multimodal/qwen3-vl/example_user.conf @@ -0,0 +1,7 @@ +*.Offline.target_qps = 80.4816666667 +*.Offline.min_duration = 600000 +*.Offline.min_query_count = 48289 + +*.Server.target_qps = 5.0 +*.Server.min_duration = 600000 +*.Server.min_query_count = 48289 \ No newline at end of file diff --git a/multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb b/multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb similarity index 99% rename from multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb rename to multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb index d973c74014..682398ac37 100644 --- a/multimodal/vl2l/notebooks/shopify-global-catalogue.ipynb +++ b/multimodal/qwen3-vl/notebooks/shopify-global-catalogue.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "f194bfdf-c9f1-4738-bdb5-258dd4bc05f0", "metadata": {}, "outputs": [], @@ -29,7 +29,7 @@ "from io import BytesIO\n", "import base64\n", "import pprint\n", - "from mlperf_inference_multimodal_vl2l.task import Task\n", + "from mlperf_inf_mm_q3vl.task import Task\n", "from openai import AsyncOpenAI, DefaultAioHttpClient\n", "import numpy as np\n", "import json\n", @@ 
-451,12 +451,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "b6aa7372", "metadata": {}, "outputs": [], "source": [ - "from mlperf_inference_multimodal_vl2l.task import ProductMetadata\n", + "from mlperf_inf_mm_q3vl.task import ProductMetadata\n", "\n", "def build_messages(sample):\n", " image_file = BytesIO()\n", diff --git a/multimodal/vl2l/pyproject.toml b/multimodal/qwen3-vl/pyproject.toml similarity index 82% rename from multimodal/vl2l/pyproject.toml rename to multimodal/qwen3-vl/pyproject.toml index 1d1d90ec75..255b3f4e16 100644 --- a/multimodal/vl2l/pyproject.toml +++ b/multimodal/qwen3-vl/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "mlperf-inference-multimodal-vl2l" -description = "The reference implementation for the vision-language-to-language (VL2L) benchmark in MLPerf Inference" +name = "mlperf-inf-mm-q3vl" +description = "The reference implementation for the Qwen3-VL (Q3VL) benchmark in MLPerf Inference" readme = "README.md" classifiers = [ "Programming Language :: Python :: 3", @@ -30,10 +30,10 @@ dynamic = ["version"] dev = ["black", "ruff", "mypy", "shellcheck-py", "pytest"] [project.scripts] -mlperf-inf-mm-vl2l = "mlperf_inference_multimodal_vl2l.cli:app" +mlperf-inf-mm-q3vl = "mlperf_inf_mm_q3vl.cli:app" [project.urls] -Homepage = "https://github.com/mlcommons/inference/multimodal/vl2l" +Homepage = "https://github.com/mlcommons/inference/multimodal/qwen3-vl" [build-system] requires = ["setuptools>=80", "setuptools-scm[simple]>=8"] @@ -43,7 +43,7 @@ build-backend = "setuptools.build_meta" where = ["src"] [tool.setuptools.package-data] -"mlperf_inference_multimodal_vl2l" = ["py.typed"] +"mlperf_inf_mm_q3vl" = ["py.typed"] [tool.setuptools_scm] root = "../../" diff --git a/multimodal/vl2l/scripts/linters.sh b/multimodal/qwen3-vl/scripts/linters.sh similarity index 100% rename from multimodal/vl2l/scripts/linters.sh rename to multimodal/qwen3-vl/scripts/linters.sh diff --git a/multimodal/qwen3-vl/scripts/slurm/benchmark.sh b/multimodal/qwen3-vl/scripts/slurm/benchmark.sh new file mode 100644 index 0000000000..00167cd3b3 --- /dev/null +++ b/multimodal/qwen3-vl/scripts/slurm/benchmark.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --time=4:00:00 +#SBATCH --partition=batch +#SBATCH --tasks=1 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --exclusive +#SBATCH --output=benchmark-slurm-output-%j.txt +#SBATCH --error=benchmark-slurm-error-%j.txt + +set -eux +set -o pipefail + +mkdir -p "${OUTPUT_HOST_DIR}"/"${SLURM_JOB_ID}" + +srun \ + --container-image="${CONTAINER_IMAGE}" \ + --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ + --no-container-mount-home \ + mlperf-inf-mm-q3vl benchmark vllm \ + --settings.test.scenario="${SCENARIO}" \ + --settings.test.mode="${MODE}" \ + --settings.test.server_expected_qps="${SERVER_EXPECTED_QPS}" \ + --vllm.model.repo_id="${MODEL_REPO_ID}" \ + --vllm.cli=--async-scheduling \ + --vllm.cli=--max-model-len=32768 \ + --vllm.cli=--limit-mm-per-prompt.video=0 \ + --vllm.cli=--tensor-parallel-size="${TENSOR_PARALLEL_SIZE}" \ + --settings.logging.log_output.outdir="${OUTPUT_CONTAINER_DIR}"/"${SLURM_JOB_ID}" \ No newline at end of file diff --git a/multimodal/qwen3-vl/scripts/slurm/evaluate.sh b/multimodal/qwen3-vl/scripts/slurm/evaluate.sh new file mode 100644 index 0000000000..54615f2e33 --- /dev/null +++ b/multimodal/qwen3-vl/scripts/slurm/evaluate.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH --time=1:00:00 +#SBATCH --partition=cpu_short 
+#SBATCH --nodes=1 +#SBATCH --tasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem-per-cpu=16G +#SBATCH --output=evaluate-slurm-output-%j.txt +#SBATCH --error=evaluate-slurm-error-%j.txt + +set -eux +set -o pipefail + +srun \ + --container-image="${CONTAINER_IMAGE}" \ + --container-mounts="${CACHE_HOST_DIR}":"${CACHE_CONTAINER_DIR}","${OUTPUT_HOST_DIR}":"${OUTPUT_CONTAINER_DIR}" \ + --no-container-mount-home \ + --container-env=NVIDIA_VISIBLE_DEVICES \ + mlperf-inf-mm-q3vl evaluate \ + --filename="${OUTPUT_CONTAINER_DIR}"/"${BENCHMARK_JOB_ID}"/mlperf_log_accuracy.json \ No newline at end of file diff --git a/multimodal/qwen3-vl/scripts/slurm/submit.sh b/multimodal/qwen3-vl/scripts/slurm/submit.sh new file mode 100644 index 0000000000..8e07336d7f --- /dev/null +++ b/multimodal/qwen3-vl/scripts/slurm/submit.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +set -eux +set -o pipefail + +DEFAULT_CONTAINER_IMAGE="" +container_image=${DEFAULT_CONTAINER_IMAGE} + +DEFAULT_MODEL_REPO_ID=Qwen/Qwen3-VL-235B-A22B-Instruct +model_repo_id=${DEFAULT_MODEL_REPO_ID} + +DEFAULT_SCENARIO=offline +scenario=${DEFAULT_SCENARIO} + +DEFAULT_MODE=accuracy_only +mode=${DEFAULT_MODE} + +DEFAULT_SERVER_EXPECTED_QPS=5 +server_expected_qps=${DEFAULT_SERVER_EXPECTED_QPS} + +DEFAULT_TENSOR_PARALLEL_SIZE=8 +tensor_parallel_size=${DEFAULT_TENSOR_PARALLEL_SIZE} + +DEFAULT_CACHE_HOST_DIR="" +cache_host_dir=${DEFAULT_CACHE_HOST_DIR} + +DEFAULT_OUTPUT_HOST_DIR=$(pwd)/outputs +output_host_dir=${DEFAULT_OUTPUT_HOST_DIR} + +DEFAULT_SLURM_ACCOUNT="" +slurm_account=${DEFAULT_SLURM_ACCOUNT} + +DEFAULT_BENCHMARK_SLURM_PARTITION="" +benchmark_slurm_partition=${DEFAULT_BENCHMARK_SLURM_PARTITION} + +DEFAULT_EVALUATE_SLURM_PARTITION="" +evaluate_slurm_partition=${DEFAULT_EVALUATE_SLURM_PARTITION} + +function _exit_with_help_msg() { + cat < None: - """Run the VL2L benchmark.""" - logger.info("Running VL2L benchmark with settings: {}", settings) - logger.info("Running VL2L benchmark with dataset: {}", dataset) + """Run the Qwen3-VL (Q3VL) benchmark.""" logger.info( - "Running VL2L benchmark with OpenAI API endpoint: {}", - endpoint) - logger.info("Running VL2L benchmark with random seed: {}", random_seed) + "Running Qwen3-VL (Q3VL) benchmark with settings: {}", + settings) + logger.info("Running Qwen3-VL (Q3VL) benchmark with dataset: {}", dataset) + logger.info( + "Running Qwen3-VL (Q3VL) benchmark with OpenAI API endpoint: {}", + endpoint, + ) + logger.info( + "Running Qwen3-VL (Q3VL) benchmark with random seed: {}", + random_seed) test_settings, log_settings = settings.to_lgtype() task = ShopifyGlobalCatalogue( dataset=dataset, @@ -96,9 +101,9 @@ ) sut = task.construct_sut() qsl = task.construct_qsl() - logger.info("Starting the VL2L benchmark with LoadGen...") + logger.info("Starting the Qwen3-VL (Q3VL) benchmark with LoadGen...") lg.StartTestWithLogSettings(sut, qsl, test_settings, log_settings) - logger.info("The VL2L benchmark with LoadGen completed.") + logger.info("The Qwen3-VL (Q3VL) benchmark with LoadGen completed.") lg.DestroyQSL(qsl) lg.DestroySUT(sut) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py similarity index 80% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py index 8db6acfa8a..66303ba6fd 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/deploy.py +++
b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/deploy.py @@ -16,6 +16,7 @@ from .log import get_log_file_path if TYPE_CHECKING: + from pathlib import Path from types import TracebackType from .schema import EndpointToDeploy, Settings, VllmEndpoint @@ -100,11 +101,17 @@ def _startup(self) -> None: """ raise NotImplementedError + @abstractmethod + def _failfast(self) -> None: + """Raise an exception if the endpoint is already detected to be dead.""" + raise NotImplementedError + def _wait_for_ready(self) -> None: """Wait for the endpoint to be ready.""" health_url = self.endpoint.url.rstrip("/v1") + "/health" start_time = time.time() while time.time() - start_time < self.endpoint.startup_timeout.total_seconds(): + self._failfast() logger.info( "Waiting {:0.2f} seconds for endpoint to be ready...", time.time() - start_time, @@ -134,6 +141,31 @@ def _shutdown(self) -> None: raise NotImplementedError +class LocalProcessNotStartedError(RuntimeError): + """The exception raised when the local process is not started yet.""" + + def __init__(self) -> None: + """Initialize the exception.""" + super().__init__("Local process is not started yet.") + + +class LocalProcessDeadError(RuntimeError): + """The exception raised when the local process is already detected to be dead.""" + + def __init__( + self, + returncode: int, + stdout_file_path: Path, + stderr_file_path: Path, + ) -> None: + """Initialize the exception.""" + super().__init__( + f"Local process has already terminated with return code {returncode}. " + f"Please check the logs in {stdout_file_path} and " + f"{stderr_file_path} for more details.", + ) + + class LocalProcessDeployer(EndpointDeployer): """Deploy and manage an endpoint that is powered by a local process.""" @@ -146,6 +178,14 @@ def __init__(self, endpoint: EndpointToDeploy, settings: Settings) -> None: """ super().__init__(endpoint=endpoint, settings=settings) self._process: subprocess.Popen | None = None + self._stdout_file_path = get_log_file_path( + key=self._stdout_log_file_key, + settings=self.settings, + ) + self._stderr_file_path = get_log_file_path( + key=self._stderr_log_file_key, + settings=self.settings, + ) @abstractmethod def _build_command(self) -> list[str]: @@ -172,34 +212,38 @@ def _startup(self) -> None: "Starting local process with environment variables: {}", os.environ) - # Get log file paths - stdout_file_path = get_log_file_path( - key=self._stdout_log_file_key, - settings=self.settings, - ) - stderr_file_path = get_log_file_path( - key=self._stderr_log_file_key, - settings=self.settings, - ) - # Start the server process = subprocess.Popen( # noqa: S603 cmd, - stdout=stdout_file_path.open("w"), - stderr=stderr_file_path.open("w"), + stdout=self._stdout_file_path.open("w"), + stderr=self._stderr_file_path.open("w"), text=True, ) logger.info("Started local process with PID: {}", process.pid) logger.info( "Local process stdout will be logged to: {}", - stdout_file_path) + self._stdout_file_path, + ) logger.info( "Local process stderr will be logged to: {}", - stderr_file_path) + self._stderr_file_path, + ) self._process = process + def _failfast(self) -> None: + """Raise an exception if the local process is already detected to be dead.""" + if self._process is None: + raise LocalProcessNotStartedError + returncode = self._process.poll() + if returncode is not None: + raise LocalProcessDeadError( + returncode=returncode, + stdout_file_path=self._stdout_file_path, + stderr_file_path=self._stderr_file_path, + ) + def _shutdown(self) -> None: """Shut down the local 
process gracefully.""" if self._process is None: @@ -256,12 +300,17 @@ def _build_command(self) -> list[str]: "vllm", "serve", self.endpoint.model.repo_id, + "--revision", + self.endpoint.model.revision, "--host", host, "--port", str(port), ] + if self.endpoint.model.token: + cmd.extend(["--hf-token", self.endpoint.model.token]) + # Add API key if provided if self.endpoint.api_key: cmd.extend(["--api-key", self.endpoint.api_key]) diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py similarity index 63% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py index 2076bdbab8..7bf59ce302 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py @@ -1,14 +1,15 @@ -"""Task definitions for the VL2L benchmark.""" +"""Task definitions for the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations import json +import os +from concurrent.futures import ProcessPoolExecutor from pathlib import Path from typing import TYPE_CHECKING import numpy as np from datasets import load_dataset -from hiclass.metrics import f1 # type: ignore[import-untyped] from loguru import logger from pydantic import ValidationError from rapidfuzz import fuzz # type: ignore[import-untyped] @@ -16,17 +17,21 @@ from tabulate import tabulate if TYPE_CHECKING: + from typing import Any + from pydantic import FilePath from .cli import Dataset as DatasetCLI from .schema import ProductMetadata -_TRUE_CATEGORY_PAD = "<|__TRUE_CATEGORY_PAD__|>" _PRED_CATEGORY_PAD = "<|__PRED_CATEGORY_PAD__|>" _PRED_BRAND_PAD = "<|__PRED_BRAND_PAD__|>" _CATEGORY_SEPARATOR = " > " +_WORKER_CONTEXT = {} +_MAX_JOBS = 4 + def get_hierarchical_components( predicted_path: str, @@ -159,63 +164,42 @@ def calculate_secondhand_f1(data: list[tuple[bool, bool]]) -> float: return f1_score(y_src, y_pred) -def calculate_hiclass_f1( - data: list[tuple[str, str]], - separator: str = _CATEGORY_SEPARATOR, -) -> float: - """Alt method to calculate hierarchical F1. +def _process_chunk_rnd_brand(args: tuple[str, dict, dict]) -> tuple[str, str]: + """Function to process only chunks for random brand predictions. Args: - data: List of tuples of predicted and true values - separator: The separator used to split the paths into levels of the category. - - Returs: - f1 score + args: Tuple containing """ - y_pred_raw = [] - y_true_raw = [] + pred_brand, elem, data_source = args + # We pass the specific data row needed, or the whole structure if efficient + return (pred_brand, data_source[elem["qsl_idx"]]["ground_truth_brand"]) - for pred, src in data: - path1 = pred.split(separator) - path2 = src.split(separator) - y_pred_raw.append(path1) - y_true_raw.append(path2) +def init_worker(dataset: dict) -> None: + """Initialize worker data to process each chunk. - # 2. Find the global maximum length across ALL samples - # We check the longest path in both true and pred lists - max_len = max(len(p) for p in y_true_raw + y_pred_raw) + Args: + dataset: huggingface dataset + """ + _WORKER_CONTEXT["dataset"] = dataset - # 3. 
Pad all lists to the global max_len - for i in range(len(y_true_raw)): - # Pad Truth - pad_len_true = max_len - len(y_true_raw[i]) - y_true_raw[i] += [_TRUE_CATEGORY_PAD] * pad_len_true - # Pad Prediction - pad_len_pred = max_len - len(y_pred_raw[i]) - y_pred_raw[i] += [_PRED_CATEGORY_PAD] * pad_len_pred +def _process_chunk(args: tuple[list[dict], int]) -> dict[str, Any]: + """Retrieve relevant information from each chunk of data. - # 4. Convert to numpy arrays - y_true = np.array(y_true_raw) - y_pred = np.array(y_pred_raw) + Args: + args: Tuple that contains chunk of data and seed - # 5. Calculate Score - return f1(y_true, y_pred) + Returns: + Object with processed information + """ + chunk_data, seed = args + # 1. Access the global dataset + dataset = _WORKER_CONTEXT["dataset"] -def run_evaluation(random_seed: int, filename: FilePath, - dataset: DatasetCLI) -> None: - """Main function to run the evaluation.""" - rng = np.random.default_rng(seed=random_seed) - with Path.open(filename) as f: - model_output = json.load(f) - - original_data = load_dataset( - dataset.repo_id, - token=dataset.token, - split="+".join(dataset.split), - ) + # 2. Create a local, reproducible RNG for this specific chunk + local_rng = np.random.default_rng(seed) num_unparsable_responses = 0 category_dataset_pred_src = [] @@ -223,13 +207,13 @@ def run_evaluation(random_seed: int, filename: FilePath, is_secondhand_pred_src = [] is_secondhand_rand_pred_src = [] brand_pred_src = [] - all_possible_brands = set() + error_messages = [] - for elem in model_output: + for elem in chunk_data: idx = elem["qsl_idx"] response = bytes.fromhex(elem["data"]).decode("utf-8") - ground_truth_item = original_data[idx] + ground_truth_item = dataset[idx] all_possible_brands.add(ground_truth_item["ground_truth_brand"]) try: pred_item = ProductMetadata.model_validate_json(response) @@ -245,14 +229,15 @@ def run_evaluation(random_seed: int, filename: FilePath, ), ), brand=_PRED_BRAND_PAD, - is_secondhand=rng.choice([True, False], size=1).tolist()[0], + is_secondhand=local_rng.choice( + [True, False], size=1).tolist()[0], ) - logger.error( - "Response\n{}\n(for the sample at index {}) cannot be validated against" - " the expected schema. Overwriting this response into \n{}\n", - response, - idx, - pred_item, + error_messages.append( + ( + f"Response\n{response}\n(for the sample at index {idx})" + f"cannot be validated against the expected schema. 
" + f"Overwriting this response into \n{pred_item}\n", + ), ) category_dataset_pred_src.append( (pred_item.category, ground_truth_item["ground_truth_category"]), @@ -268,35 +253,122 @@ def run_evaluation(random_seed: int, filename: FilePath, ) # random category selection # Uniform distribution is the default - rand_cat = rng.choice( + rand_cat = local_rng.choice( ground_truth_item["potential_product_categories"]) category_rand_pred_src.append( (rand_cat, ground_truth_item["ground_truth_category"]), ) # random is_secondhand selection - rand_is_secondhand = rng.choice([True, False]) + rand_is_secondhand = local_rng.choice([True, False]) is_secondhand_rand_pred_src.append( (rand_is_secondhand, ground_truth_item["ground_truth_is_secondhand"]), ) + return { + "num_unparsable_responses": num_unparsable_responses, + "error_messages": error_messages, + "category_dataset_pred_src": category_dataset_pred_src, + "category_rand_pred_src": category_rand_pred_src, + "is_secondhand_pred_src": is_secondhand_pred_src, + "is_secondhand_rand_pred_src": is_secondhand_rand_pred_src, + "brand_pred_src": brand_pred_src, + "all_possible_brands": list(all_possible_brands), + } + + +def run_evaluation(random_seed: int, filename: FilePath, + dataset: DatasetCLI) -> None: + """Main function to run the evaluation.""" + master_rng = np.random.default_rng(seed=random_seed) + with Path.open(filename) as f: + model_output = json.load(f) + + original_data = load_dataset( + dataset.repo_id, + token=dataset.token, + split="+".join(dataset.split), + ) + + # get number of available CPU and get chunk size + cpu_count = min(os.cpu_count() or 1, _MAX_JOBS) + chunk_size = max(len(model_output) // cpu_count, 1) + # Create chunks + output_chunks = [ + model_output[i: i + chunk_size] + for i in range(0, len(model_output), chunk_size) + ] + + # Generate Seeds + # One seed per chunk to ensure reproducibility. + # The master_rng generates these, + # so the whole run is deterministic based on `random_seed`. 
+ chunk_seeds = master_rng.integers(0, 2**32, size=len(output_chunks)) + + # Zip them: Each task is ([model_out_1, ...], 12345) + tasks = zip(output_chunks, chunk_seeds, strict=False) + + num_unparsable_responses = 0 + err_messages = [] + category_dataset_pred_src = [] + category_rand_pred_src = [] + is_secondhand_pred_src = [] + is_secondhand_rand_pred_src = [] + brand_pred_src = [] + all_possible_brands = [] + + with ProcessPoolExecutor( + max_workers=cpu_count, + initializer=init_worker, + initargs=(original_data,), + ) as executor: + # Execute + chunk_results = list(executor.map(_process_chunk, tasks)) + + for chunk in chunk_results: + num_unparsable_responses += chunk["num_unparsable_responses"] + err_messages.extend(chunk["error_messages"]) + category_dataset_pred_src.extend(chunk["category_dataset_pred_src"]) + category_rand_pred_src.extend(chunk["category_rand_pred_src"]) + is_secondhand_pred_src.extend(chunk["is_secondhand_pred_src"]) + is_secondhand_rand_pred_src.extend( + chunk["is_secondhand_rand_pred_src"]) + brand_pred_src.extend(chunk["brand_pred_src"]) + all_possible_brands.extend(chunk["all_possible_brands"]) + + for err in err_messages: + logger.error("{}", err) + category_f1_score = calculate_hierarchical_f1(category_dataset_pred_src) - hiclass_f1_score = calculate_hiclass_f1(category_dataset_pred_src) is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src) brand_score = calculate_brand_f1_score(brand_pred_src) rand_cat_f1_score = calculate_hierarchical_f1(category_rand_pred_src) - rand_hiclass_f1_score = calculate_hiclass_f1(category_rand_pred_src) + rand_is_seconhand_f1_score = calculate_secondhand_f1( is_secondhand_rand_pred_src) + + all_brands_list = list(set(all_possible_brands)) + random_brand_predictions = master_rng.choice( + all_brands_list, + size=len(model_output), + ) + + args_list = ( + (pred, elem, original_data) + for pred, elem in zip(random_brand_predictions, model_output, strict=False) + ) + + with ProcessPoolExecutor() as executor: + rand_brand_data = list( + executor.map( + _process_chunk_rnd_brand, + args_list, + chunksize=chunk_size), + ) + rand_brand_score = calculate_brand_f1_score( - [ - ( - rng.choice(list(all_possible_brands)), - original_data[elem["qsl_idx"]]["ground_truth_brand"], - ) - for elem in model_output - ], + rand_brand_data, ) logger.info( @@ -307,14 +379,12 @@ def run_evaluation(random_seed: int, filename: FilePath, [ "From accuracy file", category_f1_score, - hiclass_f1_score, brand_score, is_secondhand_f1_score, ], [ "Random selection", rand_cat_f1_score, - rand_hiclass_f1_score, rand_brand_score, rand_is_seconhand_f1_score, ], @@ -322,7 +392,6 @@ def run_evaluation(random_seed: int, filename: FilePath, headers=[ "Results", "Category hierarchical F1 Score", - "Category HiClass F1 Score", "Brand F1 Score", "Is_secondhand F1 Score", ], diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py similarity index 95% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py index a24eb514f0..56700ef7d8 100644 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/log.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/log.py @@ -1,4 +1,4 @@ -"""Logging utilities for the VL2L benchmark.""" +"""Logging utilities for the Qwen3-VL (Q3VL) benchmark.""" from __future__ import annotations diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/py.typed 
b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/py.typed similarity index 100% rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/py.typed rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/py.typed diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py new file mode 100644 index 0000000000..9462aeedd3 --- /dev/null +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/schema.py @@ -0,0 +1,867 @@ +"""Schema definitions of various data structures in the Qwen3-VL (Q3VL) benchmark.""" + +from __future__ import annotations + +from datetime import timedelta +from enum import StrEnum, auto +from pathlib import Path +from typing import Annotated, ClassVar, Self + +import mlperf_loadgen as lg +from loguru import logger +from openai.types import ResponseFormatJSONSchema +from openai.types.chat import ChatCompletionMessageParam +from pydantic import ( + BaseModel, + ConfigDict, + DirectoryPath, + Field, + FilePath, + NonNegativeInt, + field_validator, + model_validator, +) + +MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES = 100 +ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES = 1 * 1024 * 1024 * 1024 # 1GB + + +class TestScenario(StrEnum): + """The test scenario for the MLPerf inference LoadGen.""" + + SERVER = auto() + """Run the benchmark in server/interactive scenario.""" + + OFFLINE = auto() + """Run the benchmark in offline/batch scenario.""" + + def to_lgtype(self) -> lg.TestScenario: + """Convert the test scenario to its corresponding LoadGen type.""" + match self: + case TestScenario.SERVER: + return lg.TestScenario.Server + case TestScenario.OFFLINE: + return lg.TestScenario.Offline + case _: + raise UnknownTestScenarioValueError(self) + + @staticmethod + def from_lgtype(lgtype: lg.TestScenario) -> TestScenario: + """Convert the LoadGen's test scenario to the TestScenario schema.""" + match lgtype: + case lg.TestScenario.Server: + return TestScenario.SERVER + case lg.TestScenario.Offline: + return TestScenario.OFFLINE + case _: + raise UnknownTestScenarioValueError(lgtype) + + +class UnknownTestScenarioValueError(ValueError): + """The exception raised when an unknown test scenario is encountered.""" + + def __init__(self, test_scenario: TestScenario | lg.TestScenario) -> None: + """Initialize the exception.""" + super().__init__(f"Unknown test scenario: {test_scenario}") + + +class TestMode(StrEnum): + """The test mode for the MLPerf inference LoadGen.""" + + PERFORMANCE_ONLY = auto() + """Run the benchmark to evaluate performance.""" + + ACCURACY_ONLY = auto() + """Run the benchmark to evaluate model quality.""" + + def to_lgtype(self) -> lg.TestMode: + """Convert the test mode to its corresponding LoadGen type.""" + match self: + case TestMode.PERFORMANCE_ONLY: + return lg.TestMode.PerformanceOnly + case TestMode.ACCURACY_ONLY: + return lg.TestMode.AccuracyOnly + case _: + raise UnknownTestModeValueError(self) + + @staticmethod + def from_lgtype(lgtype: lg.TestMode) -> TestMode: + """Convert the LoadGen's test mode to the TestMode schema.""" + match lgtype: + case lg.TestMode.PerformanceOnly: + return TestMode.PERFORMANCE_ONLY + case lg.TestMode.AccuracyOnly: + return TestMode.ACCURACY_ONLY + case _: + raise UnknownTestModeValueError(lgtype) + + +class UnknownTestModeValueError(ValueError): + """The exception raised when an unknown test mode is encountered.""" + + def __init__(self, test_mode: TestMode | lg.TestMode) -> None: + """Initialize the exception.""" + super().__init__(f"Unknown test mode: {test_mode}") + + +class 
LoggingMode(StrEnum): + """Specifies when logging should be sampled and stringified.""" + + ASYNC_POLL = auto() + """ Logs are serialized and output on an IOThread that polls for new logs + at a fixed interval. This is the only mode currently implemented.""" + + END_OF_TEST_ONLY = auto() + """ Not implemented """ + + SYNCHRONOUS = auto() + """ Not implemented """ + + def to_lgtype(self) -> lg.LoggingMode: + """Convert logging mode to its corresponding LoadGen type.""" + match self: + case LoggingMode.ASYNC_POLL: + return lg.LoggingMode.AsyncPoll + case _: + raise UnknownLoggingModeValueError(self) + + +class UnknownLoggingModeValueError(ValueError): + """The exception raised when an unknown logging mode is encountered.""" + + def __init__(self, logging_mode: LoggingMode) -> None: + """Initialize the exception.""" + super().__init__(f"Unknown logging mode: {logging_mode}") + + +class BaseModelWithAttributeDescriptionsFromDocstrings(BaseModel): + """Base model that automatically adds attribute descriptions from docstrings.""" + + model_config = ConfigDict(use_attribute_docstrings=True, extra="forbid") + """Pydantic settings for + - Automatically add the attribute descriptions from docstrings. + - Forbid extra attributes. + """ + + +_DEFAULT_DATASET_SIZE = 48289 +_DEFAULT_MIN_DURATION = timedelta(minutes=10) +_DEFAULT_OFFLINE_EXPECTED_QPS = ( + _DEFAULT_DATASET_SIZE / _DEFAULT_MIN_DURATION.total_seconds() +) + + +class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): + """The test settings for the MLPerf inference LoadGen.""" + + scenario: TestScenario = TestScenario.OFFLINE + """The MLPerf inference benchmarking scenario to run the benchmark in.""" + + mode: TestMode = TestMode.PERFORMANCE_ONLY + """Whether you want to run the benchmark for performance measurement or accuracy + evaluation. + """ + + """Server-specific settings""" + + server_target_qps: float = 5 + """The average QPS of the poisson distribution. Note: This field is used as a + FindPeakPerformance's lower bound. When you run FindPeakPerformanceMode, you should + make sure that this value satisfies performance constraints. + """ + + server_target_latency: timedelta = timedelta(seconds=12) + """The latency constraint for the Server scenario.""" + + server_target_latency_percentile: float = 0.99 + """The latency percentile for server mode. This value is combined with + server_target_latency to determine if a run is valid. + """ + + server_coalesce_queries: bool = False + """If this flag is set to True, LoadGen will combine samples from + multiple queries into a single query if their scheduled issue times have + passed. + """ + + server_find_peak_qps_decimals_of_precision: int = 1 + """The decimal places of QPS precision used to terminate + FindPeakPerformance mode. + """ + + server_find_peak_qps_boundary_step_size: float = 1 + """The step size (as a fraction of the QPS) used to widen the lower and + upper bounds to find the initial boundaries of binary search. + """ + + server_max_async_queries: int = 0 + """The maximum number of outstanding queries to allow before earlying out from a + performance run. Useful for performance tuning and speeding up the + FindPeakPerformance mode. + """ + + server_num_issue_query_threads: int = 0 + """The number of issue query threads that will be registered and used + to call SUT's IssueQuery(). If this is 0, the same thread calling + StartTest() will be used to call IssueQuery(). See also + mlperf::RegisterIssueQueryThread(). 
+ """ + + """Offline-specific settings""" + + offline_expected_qps: float = _DEFAULT_OFFLINE_EXPECTED_QPS + """Specifies the QPS the SUT expects to hit for the offline load. + The LoadGen generates 10% more queries than it thinks it needs to meet + the minimum test duration. + """ + + sample_concatenate_permutation: bool = True + """Affects the order in which the samples of the dataset are chosen. + If False, it concatenates a single permutation of the dataset (or part + of it depending on performance_sample_count_override) several times up to the + number of samples requested. + If True, it concatenates a multiple permutation of the dataset (or a + part of it depending on `performance_sample_count_override`) several times + up to the number of samples requested. + """ + + """Test duration settings""" + + min_duration: timedelta = _DEFAULT_MIN_DURATION + """The minimum testing duration (in seconds or ISO 8601 format like `PT5S`). The + benchmark runs until this value has been met. + """ + + max_duration: timedelta = timedelta(seconds=0) + """The maximum testing duration (in seconds or ISO 8601 format like `PT5S`). The + benchmark will exit before this value has been met. 0 means infinity. + """ + + min_query_count: int = _DEFAULT_DATASET_SIZE + """The minimum testing query count. The benchmark runs until this value has been + met. If min_query_count is less than the total number of samples in the dataset, + only the first min_query_count samples will be used during testing. + """ + + max_query_count: int = 0 + """The maximum testing query count. The benchmark will exit before this value has + been met. 0 means infinity. + """ + + """Random number generation settings""" + + qsl_rng_seed: int = 0 + """Affects which subset of samples from the QSL are chosen for + the performance sample set and accuracy sample sets.""" + + sample_index_rng_seed: int = 0 + """Affects the order in which samples from the performance set will + be included in queries.""" + + schedule_rng_seed: int = 0 + """Affects the poisson arrival process of the Server scenario. + Different seeds will appear to "jitter" the queries + differently in time, but should not affect the average issued QPS. + """ + + accuracy_log_rng_seed: int = 0 + """Affects which samples have their query returns logged to the + accuracy log in performance mode.""" + + accuracy_log_probability: float = 0.0 + """The probability of the query response of a sample being logged to the + accuracy log in performance mode.""" + + accuracy_log_sampling_target: int = 0 + """The target number of samples that will have their results printed to + accuracy log in performance mode for compliance testing.""" + + """Test05 settings""" + + test05: bool = False + """Whether or not to run test05.""" + + test05_qsl_rng_seed: int = 0 + """Test05 seed for which subset of samples from the QSL are chosen for + the performance sample set and accuracy sample sets.""" + + test05_sample_index_rng_seed: int = 0 + """Test05 seed for the order in which samples from the performance set will + be included in queries.""" + + test05_schedule_rng_seed: int = 0 + """Test05 seed for the poisson arrival process of the Server scenario. + Different seeds will appear to "jitter" the queries + differently in time, but should not affect the average issued QPS. 
+ """ + + """Performance Sample modifiers""" + + print_timestamps: bool = False + """Prints measurement interval start and stop timestamps to stdout + for the purpose of comparison against an external timer.""" + + performance_issue_unique: bool = False + """Allows issuing only unique queries in Performance mode of any + scenario. This can be used to send non-repeat & hence unique + samples to SUT. + """ + + performance_issue_same: bool = False + """If True, the same query is chosen repeatedley for Inference. + In offline scenario, the query is filled with the same sample. + """ + + performance_issue_same_index: int = 0 + """Offset to control which sample is repeated in + performance_issue_same mode. Value should be within [0, performance_sample_count). + """ + + performance_sample_count_override: Annotated[ + NonNegativeInt, + Field( + description="The number of samples to use for the performance test. In the " # noqa: S608 + "performance mode, the benchmark will select P random samples from the " + "dataset, then send enough queries using these P samples (and repeating " + "them if necessary) to reach the min_duration and min_query_count. If a " + "non-zero value is passed to this flag, the P will be this value. " + "Otherwise, the benchmark will estimate how many samples can be loaded into" + f" {ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES} bytes of memory " + "based on the memory footprint of randomly selected " + f"{MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES} samples (at most), and then" + " use this estimation as the value P.", + ), + ] = _DEFAULT_DATASET_SIZE + + use_token_latencies: bool = False + """By default, the Server scenario will use `server_target_latency` as the + constraint. When set to True, the Server scenario will use `server_ttft_latency` and + `server_tpot_latency` as the constraint. + """ + + server_ttft_latency: timedelta = timedelta(milliseconds=100) + """Time to First Token (TTFT) latency constraint result validation (used when + use_token_latencies is enabled). + """ + + server_tpot_latency: timedelta = timedelta(milliseconds=100) + """Time per Output Token (TPOT) latency constraint result validation (used when + use_token_latencies is enabled). + """ + + infer_token_latencies: bool = False + """Infer token latencies from the response time.""" + + token_latency_scaling_factor: int = 1 + """Only used when infer_token_latencies is enabled. The scaling factor inferring + token latencies from the response time. 
+ """ + + @field_validator( + "server_target_latency", + "min_duration", + "max_duration", + "server_ttft_latency", + "server_tpot_latency", + mode="before", + ) + @classmethod + def parse_timedelta(cls, value: timedelta | float | + str) -> timedelta | str: + """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" + if isinstance(value, timedelta): + return value + if isinstance(value, (int, float)): + return timedelta(seconds=value) + if isinstance(value, str): + # Try to parse as a number first + try: + return timedelta(seconds=float(value)) + except ValueError: + # If it fails, it might be ISO 8601 format + # Let pydantic's default parser handle it + pass + return value + + def to_lgtype(self) -> lg.TestSettings: + """Convert the test settings to its corresponding LoadGen type.""" + settings = lg.TestSettings() + settings.scenario = self.scenario.to_lgtype() + settings.mode = self.mode.to_lgtype() + + # Server-specific settings + settings.server_target_qps = self.server_target_qps + settings.server_target_latency_ns = round( + self.server_target_latency.total_seconds() * 1e9, + ) + settings.server_target_latency_percentile = ( + self.server_target_latency_percentile + ) + settings.server_coalesce_queries = self.server_coalesce_queries + settings.server_find_peak_qps_decimals_of_precision = ( + self.server_find_peak_qps_decimals_of_precision + ) + settings.server_find_peak_qps_boundary_step_size = ( + self.server_find_peak_qps_boundary_step_size + ) + settings.server_max_async_queries = self.server_max_async_queries + settings.server_num_issue_query_threads = self.server_num_issue_query_threads + + # Offline-specific settings + settings.offline_expected_qps = self.offline_expected_qps + settings.sample_concatenate_permutation = self.sample_concatenate_permutation + + # Test duration settings + settings.min_duration_ms = round( + self.min_duration.total_seconds() * 1000) + settings.max_duration_ms = round( + self.max_duration.total_seconds() * 1000) + settings.min_query_count = self.min_query_count + settings.max_query_count = self.max_query_count + + # Random number generation settings + settings.qsl_rng_seed = self.qsl_rng_seed + settings.sample_index_rng_seed = self.sample_index_rng_seed + settings.schedule_rng_seed = self.schedule_rng_seed + settings.accuracy_log_rng_seed = self.accuracy_log_rng_seed + settings.accuracy_log_probability = self.accuracy_log_probability + settings.accuracy_log_sampling_target = self.accuracy_log_sampling_target + + # Test05 settings + settings.test05 = self.test05 + settings.test05_qsl_rng_seed = self.test05_qsl_rng_seed + settings.test05_sample_index_rng_seed = self.test05_sample_index_rng_seed + settings.test05_schedule_rng_seed = self.test05_schedule_rng_seed + + # Performance Sample modifiers + settings.print_timestamps = self.print_timestamps + settings.performance_issue_unique = self.performance_issue_unique + settings.performance_issue_same = self.performance_issue_same + settings.performance_issue_same_index = self.performance_issue_same_index + settings.performance_sample_count_override = ( + self.performance_sample_count_override + ) + settings.use_token_latencies = self.use_token_latencies + settings.ttft_latency = round( + self.server_ttft_latency.total_seconds() * 1e9) + settings.tpot_latency = round( + self.server_tpot_latency.total_seconds() * 1e9) + settings.infer_token_latencies = self.infer_token_latencies + settings.token_latency_scaling_factor = self.token_latency_scaling_factor + + return settings + + @staticmethod + 
def from_lgtype(lgtype: lg.TestSettings) -> TestSettings: + """Convert the LoadGen's test settings to the TestSettings schema.""" + return TestSettings( + scenario=TestScenario.from_lgtype(lgtype.scenario), + mode=TestMode.from_lgtype(lgtype.mode), + server_target_qps=lgtype.server_target_qps, + server_target_latency=timedelta( + seconds=lgtype.server_target_latency_ns / 1e9, + ), + server_target_latency_percentile=lgtype.server_target_latency_percentile, + server_coalesce_queries=lgtype.server_coalesce_queries, + server_find_peak_qps_decimals_of_precision=lgtype.server_find_peak_qps_decimals_of_precision, + server_find_peak_qps_boundary_step_size=lgtype.server_find_peak_qps_boundary_step_size, + server_max_async_queries=lgtype.server_max_async_queries, + server_num_issue_query_threads=lgtype.server_num_issue_query_threads, + offline_expected_qps=lgtype.offline_expected_qps, + sample_concatenate_permutation=lgtype.sample_concatenate_permutation, + min_duration=timedelta(milliseconds=lgtype.min_duration_ms), + max_duration=timedelta(milliseconds=lgtype.max_duration_ms), + min_query_count=lgtype.min_query_count, + max_query_count=lgtype.max_query_count, + qsl_rng_seed=lgtype.qsl_rng_seed, + sample_index_rng_seed=lgtype.sample_index_rng_seed, + schedule_rng_seed=lgtype.schedule_rng_seed, + accuracy_log_rng_seed=lgtype.accuracy_log_rng_seed, + accuracy_log_probability=lgtype.accuracy_log_probability, + accuracy_log_sampling_target=lgtype.accuracy_log_sampling_target, + test05=lgtype.test05, + test05_qsl_rng_seed=lgtype.test05_qsl_rng_seed, + test05_sample_index_rng_seed=lgtype.test05_sample_index_rng_seed, + test05_schedule_rng_seed=lgtype.test05_schedule_rng_seed, + print_timestamps=lgtype.print_timestamps, + performance_issue_unique=lgtype.performance_issue_unique, + performance_issue_same=lgtype.performance_issue_same, + performance_issue_same_index=lgtype.performance_issue_same_index, + performance_sample_count_override=lgtype.performance_sample_count_override, + use_token_latencies=lgtype.use_token_latencies, + server_ttft_latency=timedelta(seconds=lgtype.ttft_latency / 1e9), + server_tpot_latency=timedelta(seconds=lgtype.tpot_latency / 1e9), + infer_token_latencies=lgtype.infer_token_latencies, + token_latency_scaling_factor=lgtype.token_latency_scaling_factor, + ) + + +class LogOutputSettings(BaseModelWithAttributeDescriptionsFromDocstrings): + """The test log output settings for the MLPerf inference LoadGen.""" + + outdir: DirectoryPath = DirectoryPath("./output") + """Where to save the output files from the benchmark.""" + + prefix: str = "mlperf_log_" + """Modify the filenames of the logs with a prefix.""" + + suffix: str = "" + """Modify the filenames of the logs with a suffix.""" + + prefix_with_datetime: bool = False + """Modify the filenames of the logs with a datetime.""" + + copy_detail_to_stdout: bool = False + """Print details of performance test to stdout.""" + + copy_summary_to_stdout: bool = True + """Print results of performance test to terminal.""" + + @field_validator("outdir", mode="before") + @classmethod + def parse_directory_field(cls, value: str) -> Path: + """Verify and create the output directory to store log files.""" + path = Path(value) + path.mkdir(exist_ok=True) + return path + + def to_lgtype(self) -> lg.LogOutputSettings: + """Convert the log output settings to its corresponding LoadGen type.""" + log_output_settings = lg.LogOutputSettings() + log_output_settings.outdir = self.outdir.as_posix() + log_output_settings.prefix = self.prefix + 
log_output_settings.suffix = self.suffix + log_output_settings.prefix_with_datetime = self.prefix_with_datetime + log_output_settings.copy_detail_to_stdout = self.copy_detail_to_stdout + log_output_settings.copy_summary_to_stdout = self.copy_summary_to_stdout + return log_output_settings + + +class LogSettings(BaseModelWithAttributeDescriptionsFromDocstrings): + """The test log settings for the MLPerf inference LoadGen.""" + + log_output: LogOutputSettings = LogOutputSettings() + """Log output settings""" + + log_mode: LoggingMode = LoggingMode.ASYNC_POLL + """How and when logging should be sampled and stringified at runtime""" + + enable_trace: bool = True + """Enable trace""" + + def to_lgtype(self) -> lg.LogSettings: + """Convert log settings to its corresponding LoadGen type.""" + log_settings = lg.LogSettings() + log_settings.log_output = self.log_output.to_lgtype() + log_settings.log_mode = self.log_mode.to_lgtype() + log_settings.enable_trace = self.enable_trace + return log_settings + + +class UserConf(BaseModelWithAttributeDescriptionsFromDocstrings): + """The user.conf file for specifying LoadGen test settings.""" + + path: FilePath | None = None + """The path to the user.conf file. If provided, the test settings will be overridden + with the settings from the provided user.conf file and the mlperf.conf file from + inside LoadGen. + """ + + model: str = "qwen3-vl-235b-a22b" + """The model name that corresponds to the entries in the mlperf.conf file (in the + LoadGen) which defines the benchmark-wide constraints. + """ + + +class Settings(BaseModelWithAttributeDescriptionsFromDocstrings): + """Combine the settings for the test and logging of LoadGen.""" + + test: TestSettings + """Test settings parameters.""" + + user_conf: UserConf + """The user.conf file for specifying LoadGen test settings.""" + + logging: LogSettings + """Test logging parameters.""" + + @model_validator(mode="after") + def override_test_settings_from_user_conf(self) -> Self: + """Override the test settings from the user.conf file.""" + if self.user_conf.path: + lg_test_settings = self.test.to_lgtype() + lg_test_settings.FromConfig( + str(self.user_conf.path), + self.user_conf.model, + self.test.scenario.value.capitalize(), + ) + self.test = TestSettings.from_lgtype(lg_test_settings) + logger.info( + "Loaded test settings from the user.conf and mlperf.conf files: {}", + self.test, + ) + return self + + def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]: + """Return test and log settings for LoadGen.""" + test_settings = self.test.to_lgtype() + log_settings = self.logging.to_lgtype() + return (test_settings, log_settings) + + +class Model(BaseModelWithAttributeDescriptionsFromDocstrings): + """Specifies the model to use for the Qwen3-VL (Q3VL) benchmark.""" + + repo_id: str = "Qwen/Qwen3-VL-235B-A22B-Instruct" + """The HuggingFace repository ID of the model.""" + + token: str | None = None + """The token to access the HuggingFace repository of the model.""" + + revision: str = "710c13861be6c466e66de3f484069440b8f31389" + """The revision of the model.""" + + +class Dataset(BaseModelWithAttributeDescriptionsFromDocstrings): + """Specifies a dataset on HuggingFace.""" + + repo_id: str = "Shopify/product-catalogue" + """The HuggingFace repository ID of the dataset.""" + + token: str | None = None + """The token to access the HuggingFace repository of the dataset.""" + + revision: str = "d5c517c509f5aca99053897ef1de797d6d7e5aa5" + """The revision of the dataset.""" + + split: list[str] = ["train", "test"] 
+ """Dataset splits to use for the benchmark, e.g., "train" and "test". You can add + multiple splits by repeating the same CLI flag multiple times, e.g.: + --dataset.split test --dataset.split train + The testing dataset is a concatenation of these splits in the same order. + """ + + +class Verbosity(StrEnum): + """The verbosity level of the logger.""" + + TRACE = auto() + """The trace verbosity level.""" + + DEBUG = auto() + """The debug verbosity level.""" + + INFO = auto() + """The info verbosity level (default).""" + + +class SamplingParams(BaseModelWithAttributeDescriptionsFromDocstrings): + """Specifies the sampling parameters for the inference request to the endpoint.""" + + frequency_penalty: float = 0.0 + """Number between -2.0 and 2.0. Positive values penalize new tokens based on their + existing frequency in the text so far, decreasing the model's likelihood to repeat + the same line verbatim. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-frequency_penalty + """ + + presence_penalty: float = 0.0 + """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether + they appear in the text so far, increasing the model's likelihood to talk about new + topics. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-presence_penalty + """ + + temperature: float = 1.0 + """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will + make the output more random, while lower values like 0.2 will make it more focused + and deterministic. We generally recommend altering this or top_p but not both. See + https://platform.openai.com/docs/api-reference/chat/create#chat_create-temperature + """ + + top_p: float = 1.0 + """An alternative to sampling with temperature, called nucleus sampling, where the + model considers the results of the tokens with top_p probability mass. So 0.1 means + only the tokens comprising the top 10% probability mass are considered. We generally + recommend altering this or temperature but not both. + See https://platform.openai.com/docs/api-reference/chat/create#chat_create-top_p + """ + + top_k: int = 0 + """Controls the number of top tokens to consider. Set to 0 (or -1) to + consider all tokens. + Note that this is not part of the OpenAI API spec. Therefore, this field will be + passed in via the `extra_body` field of the inference request to the endpoint. + The inference engine therefore needs to support this field, such as what vLLM does + here: + https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L566 + """ + + min_p: float = 0.0 + """Represents the minimum probability for a token to be considered, + relative to the probability of the most likely token. Must be in [0, 1]. + Set to 0 to disable this. + Note that this is not part of the OpenAI API spec. Therefore, this field will be + passed in via the `extra_body` field of the inference request to the endpoint. + The inference engine therefore needs to support this field, such as what vLLM does + here: + https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L567 + """ + + repetition_penalty: float = 1.0 + """Penalizes new tokens based on whether they appear in the prompt and the + generated text so far. Values > 1 encourage the model to use new tokens, + while values < 1 encourage the model to repeat tokens. + Note that this is not part of the OpenAI API spec. 
Therefore, this field will be
+    passed in via the `extra_body` field of the inference request to the endpoint.
+    The inference engine therefore needs to support this field, such as what vLLM does
+    here:
+    https://github.com/vllm-project/vllm/blob/83a317f650f210b86572b13b8198b7d38aaacb7e/vllm/entrypoints/openai/protocol.py#L568
+    """
+
+
+class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings):
+    """Specifies the OpenAI API endpoint to use for the Qwen3-VL (Q3VL) benchmark."""
+
+    url: str = "http://localhost:8000/v1"
+    """The URL of the OpenAI API endpoint that the inference requests are sent to."""
+
+    api_key: str = ""
+    """The API key to authenticate the inference requests."""
+
+    model: Model
+    """The model to use for the Qwen3-VL (Q3VL) benchmark, i.e., the model that was
+    deployed behind this OpenAI API endpoint.
+    """
+
+    use_guided_decoding: bool = False
+    """If True, the benchmark will enable guided decoding for the requests. This
+    requires the endpoint (and the inference engine behind it) to support guided
+    decoding. If False, the response from the endpoint might not be directly parsable
+    by the response JSON schema (e.g., the JSON object might be fenced in a
+    ```json ... ``` code block).
+    """
+
+    request_timeout: timedelta = timedelta(hours=2)
+    """The timeout for the inference request to the endpoint. The default value for
+    OpenAI API client is 10 minutes
+    (https://github.com/openai/openai-python?tab=readme-ov-file#timeouts) which might
+    not be sufficient for the offline scenario.
+    """
+
+    sampling_params: SamplingParams
+    """The sampling parameters to use for the inference request to the endpoint."""
+
+
+class EndpointToDeploy(Endpoint):
+    """Specifies the endpoint to deploy for the Qwen3-VL (Q3VL) benchmark."""
+
+    startup_timeout: timedelta = timedelta(hours=1)
+    """The timeout for the endpoint to start up."""
+
+    shutdown_timeout: timedelta = timedelta(minutes=1)
+    """The timeout for the endpoint to shut down."""
+
+    poll_interval: timedelta = timedelta(seconds=60)
+    """The interval to poll the endpoint for readiness."""
+
+    healthcheck_timeout: timedelta = timedelta(seconds=5)
+    """The timeout for the healthcheck request to the endpoint."""
+
+
+class VllmEndpoint(EndpointToDeploy):
+    """Specifies how to deploy an OpenAI API endpoint in vLLM for benchmarking."""
+
+    cli: list[str] = []
+    """The CLI arguments to pass to `vllm serve`. This excludes vllm's `--host`,
+    `--port`, `--api-key`, `--hf-token`, `--model` and `--revision` CLI arguments,
+    which are determined by the `url`, `api_key` and `model` fields of this schema."""
+
+    @model_validator(mode="after")
+    def validate_cli(self) -> Self:
+        """Validate the vllm CLI arguments."""
+        for flag in self.cli:
+            if not flag.startswith(("--", "-")):
+                raise PositionalVllmCliFlagError(flag)
+            if flag.split("=", 1)[0] in BlacklistedVllmCliFlagError.BLACKLIST:
+                raise BlacklistedVllmCliFlagError(flag)
+        return self
+
+
+class PositionalVllmCliFlagError(ValueError):
+    """The exception raised when a positional vllm CLI flag is encountered."""
+
+    def __init__(self, flag: str) -> None:
+        """Initialize the exception."""
+        super().__init__(
+            f"Positional vllm CLI flag: {flag} is not allowed. Only optional flags are "
+            "allowed to be passed to `--vllm.cli`.",
+        )
+
+
+class BlacklistedVllmCliFlagError(ValueError):
+    """The exception raised when a blacklisted vllm CLI flag is encountered."""
+
+    BLACKLIST: ClassVar[list[str]] = [
+        "--model",
+        "--revision",
+        "--host",
+        "--port",
+        "--hf-token",
+        "--api-key",
+    ]
+
+    def __init__(self, flag: str) -> None:
+        """Initialize the exception."""
+        super().__init__(
+            f"Blacklisted vllm CLI flag: {flag} is not allowed. The blacklisted flags "
+            f"are {self.BLACKLIST}.",
+        )
+
+
+class ProductMetadata(BaseModelWithAttributeDescriptionsFromDocstrings):
+    """JSON format for the expected responses from the VLM."""
+
+    category: str
+    """The complete category of the product, e.g.,
+    "Clothing & Accessories > Clothing > Shirts > Polo Shirts".
+    Each categorical level is separated by " > ".
+    """
+
+    brand: str
+    """The brand of the product, e.g., "giorgio armani"."""
+
+    is_secondhand: bool
+    """True if the product is second-hand, False otherwise."""
+
+
+class LoadedSample(BaseModelWithAttributeDescriptionsFromDocstrings):
+    """Sample format to be used by LoadGen."""
+
+    messages: list[ChatCompletionMessageParam]
+    """The messages to be sent for chat completion to the VLM inference endpoint."""
+
+    response_format: ResponseFormatJSONSchema | None = None
+    """The response format to be used during guided decoding."""
+
+    @field_validator("messages", mode="after")
+    @classmethod
+    def ensure_content_is_list(
+        cls,
+        messages: list[ChatCompletionMessageParam],
+    ) -> list[ChatCompletionMessageParam]:
+        """If the content is a `ValidatorIterator`, convert it back to a list.
+
+        This is to work around a Pydantic bug. See
+        https://github.com/pydantic/pydantic/issues/9467 for more details.
+        """
+        for message in messages:
+            if (
+                "content" in message
+                and message["content"].__class__.__module__
+                == "pydantic_core._pydantic_core"
+                and message["content"].__class__.__name__ == "ValidatorIterator"
+            ):
+                message["content"] = list(
+                    message["content"])  # type: ignore[arg-type]
+        return messages
diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py
similarity index 88%
rename from multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py
rename to multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py
index c63a690a32..ddcd962ea1 100644
--- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py
+++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py
@@ -1,4 +1,4 @@
-"""Task definitions for the VL2L benchmark."""
+"""Task definitions for the Qwen3-VL (Q3VL) benchmark."""
 
 from __future__ import annotations
 
@@ -13,6 +13,7 @@
 from io import BytesIO
 from typing import Any
 
+import httpx
 import mlperf_loadgen as lg
 from datasets import load_dataset
 from loguru import logger
@@ -56,17 +57,21 @@ def __init__(
             revision=dataset.revision,
             split="+".join(dataset.split),
         )
-        logger.debug(
-            "Loaded {} samples from the dataset splits {}.",
+        logger.info(
+            "Imported {} samples from the dataset splits {}.",
             len(self.dataset),
             dataset.split,
         )
         self.endpoint = endpoint
+        request_timeout_seconds = endpoint.request_timeout.total_seconds()
         self.openai_api_client = AsyncOpenAI(
             base_url=endpoint.url,
-            http_client=DefaultAioHttpClient(),
+            http_client=DefaultAioHttpClient(
+                timeout=httpx.Timeout(
+                    timeout=request_timeout_seconds, connect=5.0),
+            ),
             api_key=endpoint.api_key,
-            timeout=endpoint.request_timeout.total_seconds(),
+            timeout=request_timeout_seconds,
         )
         self.event_loop, 
self.event_loop_thread = ( self._create_event_loop_in_separate_thread() @@ -98,7 +103,7 @@ async def _cancel_all_tasks() -> None: _cancel_all_tasks(), self.event_loop, ).result(timeout=5.0) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.trace("Error cancelling tasks during cleanup: {}", e) # Try to close the OpenAI client gracefully @@ -107,7 +112,7 @@ async def _cancel_all_tasks() -> None: self.openai_api_client.close(), self.event_loop, ).result(timeout=5.0) - except Exception as e: + except Exception as e: # noqa: BLE001 logger.trace("Error closing OpenAI client during cleanup: {}", e) # Stop the event loop and join the thread @@ -204,8 +209,9 @@ def estimated_num_performance_samples(self) -> int: self.total_num_samples, ) logger.debug( - "Estimated number of performance samples that will be loaded into the host" + "Estimated number of performance samples that can be loaded into {} GB host" " memory before testing is {}.", + ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES / 1024 / 1024 / 1024, result, ) if self.settings.performance_sample_count_override > 0: @@ -226,11 +232,22 @@ def _load_samples_to_ram(query_sample_indices: list[int]) -> None: Args: query_sample_indices: The indices of the samples to load to host memory. """ + logger.info( + "Starting to load {} samples to RAM...", + len(query_sample_indices), + ) + tic = time.perf_counter() for index in query_sample_indices: self.loaded_samples[index] = self.formulate_loaded_sample( self.dataset[index], use_guided_decoding=self.endpoint.use_guided_decoding, ) + logger.info( + "Loaded {} samples to RAM, which took {} seconds and {} GB in total.", + len(query_sample_indices), + time.perf_counter() - tic, + asizeof.asizeof(self.loaded_samples) / 1024 / 1024 / 1024, + ) def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: """Called by LoadGen to unload samples from host memory after testing. @@ -239,9 +256,19 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: query_sample_indices: The indices of the samples to unload from host memory. 
""" + logger.info( + "Starting to unload {} samples from RAM...", + len(query_sample_indices), + ) + tic = time.perf_counter() for index in query_sample_indices: sample_to_unload = self.loaded_samples.pop(index, None) del sample_to_unload + logger.info( + "Unloaded {} samples from RAM, which took {} seconds.", + len(query_sample_indices), + time.perf_counter() - tic, + ) return lg.ConstructQSL( self.total_num_samples, @@ -279,6 +306,17 @@ async def _query_endpoint_async_batch( if sample.response_format is not None else None ), + frequency_penalty=self.endpoint.sampling_params.frequency_penalty, + presence_penalty=self.endpoint.sampling_params.presence_penalty, + temperature=self.endpoint.sampling_params.temperature, + top_p=self.endpoint.sampling_params.top_p, + extra_body={ + "top_k": self.endpoint.sampling_params.top_k, + "min_p": self.endpoint.sampling_params.min_p, + "repetition_penalty": ( + self.endpoint.sampling_params.repetition_penalty + ), + }, ) logger.debug( "Received response (ID: {}) from endpoint after {} seconds.", @@ -360,6 +398,17 @@ async def _query_endpoint_async_stream( if sample.response_format is not None else None ), + frequency_penalty=self.endpoint.sampling_params.frequency_penalty, + presence_penalty=self.endpoint.sampling_params.presence_penalty, + temperature=self.endpoint.sampling_params.temperature, + top_p=self.endpoint.sampling_params.top_p, + extra_body={ + "top_k": self.endpoint.sampling_params.top_k, + "min_p": self.endpoint.sampling_params.min_p, + "repetition_penalty": ( + self.endpoint.sampling_params.repetition_penalty + ), + }, ) # iterate asynchronously total_tokens = 0 @@ -472,6 +521,10 @@ def _issue_queries(query_samples: list[lg.QuerySample]) -> None: def _flush_queries() -> None: """Called by the LoadGen to indicate that all queries have been issued.""" + logger.info( + "LoadGen has indicated that all queries have been issued. " + "Waiting for all pending queries to complete...", + ) async def _wait_for_pending_queries_async() -> None: """Wait for all pending queries to complete.""" @@ -494,6 +547,7 @@ async def _wait_for_pending_queries_async() -> None: self.event_loop, ) future.result() + logger.info("All pending queries has completed.") return lg.ConstructSUT(_issue_queries, _flush_queries) diff --git a/multimodal/vl2l/README.md b/multimodal/vl2l/README.md deleted file mode 100644 index 5720fb7fd7..0000000000 --- a/multimodal/vl2l/README.md +++ /dev/null @@ -1,193 +0,0 @@ -# Reference Implementation for the Vision-language-to-language (VL2L) Benchmark - -## Quick Start - -This guide demonstrates how you can run the benchmark on your local machine. - -### Create a Conda environment - -Follow [this link](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions) -on how to install Miniconda on your host machine. 
Then, you can create a new conda -environment via: - -```bash -conda create -n mlperf-inf-mm-vl2l python=3.12 -``` - -### Install the VL2L benchmarking CLI - -#### For users - -Install `mlperf-inf-mm-vl2l` with: - -```bash -pip install git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l/ -``` - -#### For developers - -Clone the MLPerf Inference repo via: - -```bash -git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf-inference -``` - -Then enter the repo: - -```bash -cd mlperf-inference/ -``` - -Install `mlperf-inf-mm-vl2l` and the development tools with: - -- On Bash -```bash -pip install -e multimodal/vl2l/[dev] -``` -- On Zsh -```zsh -pip install -e multimodal/vl2l/"[dev]" -``` - -### Post VL2L benchmarking CLI installation - -After installation, you can check the CLI flags that `mlperf-inf-mm-vl2l` can take with: - -```bash -mlperf-inf-mm-vl2l --help -``` - -You can enable shell autocompletion for `mlperf-inf-mm-vl2l` with: - -```bash -mlperf-inf-mm-vl2l --install-completion -``` - -> [!NOTE] -> Shell auto-completion will take effect once you restart the terminal. - -### Start an inference endpoint on your local host machine with vLLM - -Please refer to [this guide on how to launch vLLM for various Qwen3 VL MoE models](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3-VL.html). - -```bash -docker run --gpus all \ # Use all the GPUs on this host machine. - -v ~/.cache/huggingface:/root/.cache/huggingface \ # Use the HuggingFace cache from your host machine. - -p 8000:8000 \ # This assumes the endpoint will use port 8000. - --ipc=host \ # The container can access and utilize the host's IPC mechanisms (e.g., shared memory). - vllm/vllm-openai:nightly \ # You can also use the `:latest` container or a specific release. - --model Qwen/Qwen3-VL-235B-A22B-Instruct \ # Specifies the model for vLLM to deploy. - --tensor-parallel-size 8 \ # 8-way tensor-parallel inference across 8 GPUs. - --limit-mm-per-prompt.video 0 # The input requests will contain images only (i.e., no videos). -``` - -### Run the benchmark for the Offline scenario - -Performance only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario offline --settings.test.mode performance_only -``` - -Accuracy only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario offline --settings.test.mode accuracy_only -``` - -### Run the benchmark for the Server scenario - -Performance only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario server --settings.test.mode performance_only -``` - -Accuracy only mode: - -```bash -mlperf-inf-mm-vl2l benchmark endpoint --settings.test.scenario server --settings.test.mode accuracy_only -``` - -### Evalute the response quality - -```bash -mlperf-inf-mm-vl2l evaluate --filename output/mlperf_log_accuracy.json -``` - -## Docker - -[docker/](docker/) provides examples of Dockerfiles that install the VL2L benchmarking -CLI into the container images of the inference engine. This is useful when you have to -run both the inference engine and the VL2L benchmarking CLI inside the same container, -for example, in a situation where you must use a GPU cluster managed by -[Slurm](https://slurm.schedmd.com/) with [enroot](https://github.com/nvidia/enroot) and -[pyxis](https://github.com/NVIDIA/pyxis). - -As an illustrative example, assuming that you are at the root directory of the MLPerf -Inference repo: - -1. 
You can build a container image against the vLLM's -`vllm/vllm-openai:v0.12.0` release by - -```bash -docker build \ - --build-arg BASE_IMAGE_URL=vllm/vllm-openai:v0.12.0 \ - --build-arg MLPERF_INF_MM_VL2L_INSTALL_URL=multimodal/vl2l \ - -f multimodal/vl2l/docker/vllm-cuda.Dockerfile \ - -t mlperf-inf-mm-vl2l:vllm-openai-v0.12.0 \ - . -``` -> [!NOTE] -> `MLPERF_INF_MM_VL2L_INSTALL_URL` can also take in a remote GitHub location, such as -> `git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/vl2l/`. - -2. Afterwards, you can start the container in the interactive mode by - -```bash -docker run --rm -it --gpus all -v ~/.cache:/root/.cache --ipc=host mlperf-inf-mm-vl2l:vllm-openai-v0.12.0 -``` - -### Benchmark against vLLM inside the container - -If you are running `mlperf-inf-mm-vl2l` inside a local environment that has access to -vLLM (such as inside a container that was created using the -[docker/vllm-cuda.Dockerfile](docker/vllm-cuda.Dockerfile)), you can use a single -`mlperf-inf-mm-vl2l benchmark vllm` command to achieve: - -1. Deploy an endpoint using vLLM. -2. Wait for the endpoint to be healthy. -3. Run the benchmark against that endpoint. - -For example, inside the container, you can run the Offline scenario Accuracy only -mode with: - -```bash -mlperf-inf-mm-vl2l benchmark vllm \ - --settings.test.scenario offline \ - --settings.test.mode accuracy_only \ - --dataset.token ... \ - --vllm.cli=--async-scheduling \ - --vllm.cli=--max-model-len=32768 \ - --vllm.cli=--max-num-seqs=1024 \ - --vllm.cli=--compilation-config='{ - "cudagraph_capture_sizes": [ - 1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, - 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, - 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, - 496, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768 - ] - }' \ - --vllm.cli=--limit-mm-per-prompt.video=0 \ - --vllm.cli=--tensor-parallel-size=8 -``` - -## Developer Guide - -### Linting - -You can lint the VL2L benchmark source code by running the following script: - -```bash -bash multimodal/vl2l/scripts/linters.sh -``` \ No newline at end of file diff --git a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py b/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py deleted file mode 100644 index 5b325fff80..0000000000 --- a/multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/schema.py +++ /dev/null @@ -1,513 +0,0 @@ -"""Schema definitions of various data structures in the VL2L benchmark.""" - -from __future__ import annotations - -from datetime import timedelta -from enum import StrEnum, auto -from pathlib import Path -from typing import Annotated, ClassVar, Self - -import mlperf_loadgen as lg -from openai.types import ResponseFormatJSONSchema -from openai.types.chat import ChatCompletionMessageParam -from pydantic import ( - BaseModel, - ConfigDict, - DirectoryPath, - Field, - NonNegativeInt, - field_validator, - model_validator, -) - -MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES = 100 -ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES = 1 * 1024 * 1024 * 1024 # 1GB - - -class TestScenario(StrEnum): - """The test scenario for the MLPerf inference LoadGen.""" - - SERVER = auto() - """Run the benchmark in server/interactive scenario.""" - - OFFLINE = auto() - """Run the benchmark in offline/batch scenario.""" - - def to_lgtype(self) -> lg.TestScenario: - """Convert the test scenario to its corresponding LoadGen type.""" - match self: - case 
TestScenario.SERVER: - return lg.TestScenario.Server - case TestScenario.OFFLINE: - return lg.TestScenario.Offline - case _: - raise UnknownTestScenarioValueError(self) - - -class UnknownTestScenarioValueError(ValueError): - """The exception raised when an unknown test scenario is encountered.""" - - def __init__(self, test_scenario: TestScenario) -> None: - """Initialize the exception.""" - super().__init__(f"Unknown test scenario: {test_scenario}") - - -class TestMode(StrEnum): - """The test mode for the MLPerf inference LoadGen.""" - - PERFORMANCE_ONLY = auto() - """Run the benchmark to evaluate performance.""" - - ACCURACY_ONLY = auto() - """Run the benchmark to evaluate model quality.""" - - def to_lgtype(self) -> lg.TestMode: - """Convert the test mode to its corresponding LoadGen type.""" - match self: - case TestMode.PERFORMANCE_ONLY: - return lg.TestMode.PerformanceOnly - case TestMode.ACCURACY_ONLY: - return lg.TestMode.AccuracyOnly - case _: - raise UnknownTestModeValueError(self) - - -class UnknownTestModeValueError(ValueError): - """The exception raised when an unknown test mode is encountered.""" - - def __init__(self, test_mode: TestMode) -> None: - """Initialize the exception.""" - super().__init__(f"Unknown test mode: {test_mode}") - - -class LoggingMode(StrEnum): - """Specifies when logging should be sampled and stringified.""" - - ASYNC_POLL = auto() - """ Logs are serialized and output on an IOThread that polls for new logs - at a fixed interval. This is the only mode currently implemented.""" - - END_OF_TEST_ONLY = auto() - """ Not implemented """ - - SYNCHRONOUS = auto() - """ Not implemented """ - - def to_lgtype(self) -> lg.LoggingMode: - """Convert logging mode to its corresponding LoadGen type.""" - match self: - case LoggingMode.ASYNC_POLL: - return lg.LoggingMode.AsyncPoll - case _: - raise UnknownLoggingModeValueError(self) - - -class UnknownLoggingModeValueError(ValueError): - """The exception raised when an unknown logging mode is encountered.""" - - def __init__(self, logging_mode: LoggingMode) -> None: - """Initialize the exception.""" - super().__init__(f"Unknown logging mode: {logging_mode}") - - -class BaseModelWithAttributeDescriptionsFromDocstrings(BaseModel): - """Base model that automatically adds attribute descriptions from docstrings.""" - - model_config = ConfigDict(use_attribute_docstrings=True, extra="forbid") - """Pydantic settings for - - Automatically add the attribute descriptions from docstrings. - - Forbid extra attributes. - """ - - -_DEFAULT_DATASET_SIZE = 48289 -_DEFAULT_MIN_DURATION = timedelta(minutes=10) -_DEFAULT_OFFLINE_EXPECTED_QPS = ( - _DEFAULT_DATASET_SIZE / _DEFAULT_MIN_DURATION.total_seconds() -) - - -class TestSettings(BaseModelWithAttributeDescriptionsFromDocstrings): - """The test settings for the MLPerf inference LoadGen.""" - - scenario: TestScenario = TestScenario.OFFLINE - """The MLPerf inference benchmarking scenario to run the benchmark in.""" - - mode: TestMode = TestMode.PERFORMANCE_ONLY - """Whether you want to run the benchmark for performance measurement or accuracy - evaluation. - """ - - offline_expected_qps: float = _DEFAULT_OFFLINE_EXPECTED_QPS - """The expected QPS for the offline scenario.""" - - # sample_concatenate_permutation: bool = True # noqa: ERA001 - # """Affects the order in which the samples of the dataset are chosen. 
- # If `False`, it concatenates a single permutation of the dataset (or part - # of it depending on `performance_sample_count_override`) several times up to the - # number of samples requested. - # If `True`, it concatenates a multiple permutation of the dataset (or a - # part of it depending on `performance_sample_count_override`) several times - # up to the number of samples requested. - # """ - - server_expected_qps: float = 10 - """The expected QPS for the server scenario. Loadgen will try to send as many - request as necessary to achieve this value. - """ - - server_target_latency: timedelta = timedelta(seconds=1) - """Expected latency constraint for Server scenario. This is a constraint that we - expect depending on the argument server_expected_qps. When server_expected_qps - increases, we expect the latency to also increase. When server_expected_qps - decreases, we expect the latency to also decrease. - """ - - server_ttft_latency: timedelta = timedelta(seconds=1) - """Time to First Token (TTFT) latency constraint result validation (used when - use_token_latencies is enabled). - """ - - server_tpot_latency: timedelta = timedelta(seconds=1) - """Time per Output Token (TPOT) latency constraint result validation (used when - use_token_latencies is enabled). - """ - - min_duration: timedelta = _DEFAULT_MIN_DURATION - """The minimum testing duration (in seconds or ISO 8601 format like `PT5S`). The - benchmark runs until this value has been met. - """ - - min_query_count: int = _DEFAULT_DATASET_SIZE - """The minimum testing query count. The benchmark runs until this value has been - met. If min_query_count is less than the total number of samples in the dataset, - only the first min_query_count samples will be used during testing. - """ - - performance_sample_count_override: Annotated[ - NonNegativeInt, - Field( - description="The number of samples to use for the performance test. In the " # noqa: S608 - "performance mode, the benchmark will select P random samples from the " - "dataset, then send enough queries using these P samples (and repeating " - "them if necessary) to reach the min_duration and min_query_count. If a " - "non-zero value is passed to this flag, the P will be this value. " - "Otherwise, the benchmark will estimate how many samples can be loaded into" - f" {ALLOWED_MEMORY_FOOTPRINT_PERFORMANCE_SAMPLES} bytes of memory " - "based on the memory footprint of randomly selected " - f"{MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES} samples (at most), and then" - " use this estimation as the value P.", - ), - ] = _DEFAULT_DATASET_SIZE - - use_token_latencies: bool = False - """By default, the Server scenario will use `server_target_latency` as the - constraint. When set to True, the Server scenario will use `server_ttft_latency` and - `server_tpot_latency` as the constraint. 
- """ - - @field_validator( - "server_target_latency", - "server_ttft_latency", - "server_tpot_latency", - "min_duration", - mode="before", - ) - @classmethod - def parse_timedelta(cls, value: timedelta | float | - str) -> timedelta | str: - """Parse timedelta from seconds (int/float/str) or ISO 8601 format.""" - if isinstance(value, timedelta): - return value - if isinstance(value, (int, float)): - return timedelta(seconds=value) - if isinstance(value, str): - # Try to parse as a number first - try: - return timedelta(seconds=float(value)) - except ValueError: - # If it fails, it might be ISO 8601 format - # Let pydantic's default parser handle it - pass - return value - - def to_lgtype(self) -> lg.TestSettings: - """Convert the test settings to its corresponding LoadGen type.""" - settings = lg.TestSettings() - settings.scenario = self.scenario.to_lgtype() - settings.mode = self.mode.to_lgtype() - settings.offline_expected_qps = self.offline_expected_qps - settings.server_target_qps = self.server_expected_qps - settings.server_target_latency_ns = round( - self.server_target_latency.total_seconds() * 1e9, - ) - settings.ttft_latency = round( - self.server_ttft_latency.total_seconds() * 1e9) - settings.tpot_latency = round( - self.server_tpot_latency.total_seconds() * 1e9) - settings.min_duration_ms = round( - self.min_duration.total_seconds() * 1000) - settings.min_query_count = self.min_query_count - settings.performance_sample_count_override = ( - self.performance_sample_count_override - ) - settings.use_token_latencies = self.use_token_latencies - return settings - - -class LogOutputSettings(BaseModelWithAttributeDescriptionsFromDocstrings): - """The test log output settings for the MLPerf inference LoadGen.""" - - outdir: DirectoryPath = DirectoryPath("./output") - """Where to save the output files from the benchmark.""" - - prefix: str = "mlperf_log_" - """Modify the filenames of the logs with a prefix.""" - - suffix: str = "" - """Modify the filenames of the logs with a suffix.""" - - prefix_with_datetime: bool = False - """Modify the filenames of the logs with a datetime.""" - - copy_detail_to_stdout: bool = False - """Print details of performance test to stdout.""" - - copy_summary_to_stdout: bool = True - """Print results of performance test to terminal.""" - - @field_validator("outdir", mode="before") - @classmethod - def parse_directory_field(cls, value: str) -> Path: - """Verify and create the output directory to store log files.""" - path = Path(value) - path.mkdir(exist_ok=True) - return path - - def to_lgtype(self) -> lg.LogOutputSettings: - """Convert the log output settings to its corresponding LoadGen type.""" - log_output_settings = lg.LogOutputSettings() - log_output_settings.outdir = self.outdir.as_posix() - log_output_settings.prefix = self.prefix - log_output_settings.suffix = self.suffix - log_output_settings.prefix_with_datetime = self.prefix_with_datetime - log_output_settings.copy_detail_to_stdout = self.copy_detail_to_stdout - log_output_settings.copy_summary_to_stdout = self.copy_summary_to_stdout - return log_output_settings - - -class LogSettings(BaseModelWithAttributeDescriptionsFromDocstrings): - """The test log settings for the MLPerf inference LoadGen.""" - - log_output: LogOutputSettings = LogOutputSettings() - """Log output settings""" - - log_mode: LoggingMode = LoggingMode.ASYNC_POLL - """How and when logging should be sampled and stringified at runtime""" - - enable_trace: bool = True - """Enable trace""" - - def to_lgtype(self) -> lg.LogSettings: - 
"""Convert log settings to its corresponding LoadGen type.""" - log_settings = lg.LogSettings() - log_settings.log_output = self.log_output.to_lgtype() - log_settings.log_mode = self.log_mode.to_lgtype() - log_settings.enable_trace = self.enable_trace - return log_settings - - -class Settings(BaseModelWithAttributeDescriptionsFromDocstrings): - """Combine the settings for the test and logging of LoadGen.""" - - test: TestSettings - """Test settings parameters.""" - - logging: LogSettings - """Test logging parameters.""" - - def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]: - """Return test and log settings for LoadGen.""" - test_settings = self.test.to_lgtype() - log_settings = self.logging.to_lgtype() - return (test_settings, log_settings) - - -class Model(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies the model to use for the VL2L benchmark.""" - - repo_id: str = "Qwen/Qwen3-VL-235B-A22B-Instruct" - """The HuggingFace repository ID of the model.""" - - -class Dataset(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies a dataset on HuggingFace.""" - - repo_id: str = "Shopify/the-catalogue-public-beta" - """The HuggingFace repository ID of the dataset.""" - - token: str | None = None - """The token to access the HuggingFace repository of the dataset.""" - - revision: str | None = None - """The revision of the dataset. If not provided, the default revision (i.e., usually - `main`) will be used. - """ - - split: list[str] = ["train", "test"] - """Dataset splits to use for the benchmark, e.g., "train" and "test". You can add - multiple splits by repeating the same CLI flag multiple times, e.g.: - --dataset.split test --dataset.split train - The testing dataset is a concatenation of these splits in the same order. - """ - - -class Verbosity(StrEnum): - """The verbosity level of the logger.""" - - TRACE = auto() - """The trace verbosity level.""" - - DEBUG = auto() - """The debug verbosity level.""" - - INFO = auto() - """The info verbosity level (default).""" - - -class Endpoint(BaseModelWithAttributeDescriptionsFromDocstrings): - """Specifies the OpenAI API endpoint to use for the VL2L benchmark.""" - - url: str = "http://localhost:8000/v1" - """The URL of the OpenAI API endpoint that the inference requests are sent to.""" - - api_key: str = "" - """The API key to authenticate the inference requests.""" - - model: Model - """The model to use for the VL2L benchmark, i.e., the model that was deployed behind - this OpenAI API endpoint. - """ - - use_guided_decoding: bool = False - """If True, the benchmark will enable guided decoding for the requests. This - requires the endpoint (and the inference engine behind it) to support guided - decoding. If False, the response from the endpoint might not be directly parsable - by the response JSON schema (e.g., the JSON object might be fenced in a - ```json ... ``` code block). - """ - - request_timeout: timedelta = timedelta(hours=2) - """The timeout for the inference request to the endpoint. The default value for - OpenAI API client is 10 minutes - (https://github.com/openai/openai-python?tab=readme-ov-file#timeouts) which might - not be sufficient for the offline scenario. 
- """ - - -class EndpointToDeploy(Endpoint): - """Specifies the endpoint to deploy for the VL2L benchmark.""" - - startup_timeout: timedelta = timedelta(minutes=20) - """The timeout for the endpoint to start up.""" - - shutdown_timeout: timedelta = timedelta(minutes=1) - """The timeout for the endpoint to shut down.""" - - poll_interval: timedelta = timedelta(seconds=60) - """The interval to poll the endpoint for readiness.""" - - healthcheck_timeout: timedelta = timedelta(seconds=5) - """The timeout for the healthcheck request to the endpoint.""" - - -class VllmEndpoint(EndpointToDeploy): - """Specifies how to deploy an OpenAI API endpoint in vLLM for benchmarking.""" - - cli: list[str] = [] - """The CLI arguments to pass to `vllm serve`. This excludes vllm's `--host`, - `--port`, --api-key` and `--model` CLI arguments which will be determined by - the `url`, `api_key` and `model` fields of this schema.""" - - @model_validator(mode="after") - def validate_cli(self) -> Self: - """Validate the vllm CLI arguments.""" - for flag in self.cli: - if not flag.startswith(("--", "-")): - raise PositionalVllmCliFlagError(flag) - if flag.split("=", 1)[0] in BlacklistedVllmCliFlagError.BLACKLIST: - raise BlacklistedVllmCliFlagError(flag) - return self - - -class PositionalVllmCliFlagError(ValueError): - """The exception raised when a positional vllm CLI flag is encountered.""" - - def __init__(self, flag: str) -> None: - """Initialize the exception.""" - super().__init__( - f"Positional vllm CLI flag: {flag} is not allowed. Only optional flags are " - "allowed to be passed to `--vllm.cli`.", - ) - - -class BlacklistedVllmCliFlagError(ValueError): - """The exception raised when a blacklisted vllm CLI flag is encountered.""" - - BLACKLIST: ClassVar[list[str]] = [ - "--model", "--host", "--port", "--api-key"] - - def __init__(self, flag: str) -> None: - """Initialize the exception.""" - super().__init__( - f"Blacklisted vllm CLI flag: {flag} is not allowed. The blacklisted flags" - f"are {self.BLACKLIST}.", - ) - - -class ProductMetadata(BaseModelWithAttributeDescriptionsFromDocstrings): - """Json format for the expected responses from the VLM.""" - - category: str - """The complete category of the product, e.g., - "Clothing & Accessories > Clothing > Shirts > Polo Shirts". - Each categorical level is separated by " > ". - """ - - brand: str - """The brand of the product, e.g., "giorgio armani".""" - - is_secondhand: bool - """True if the product is second-hand, False otherwise.""" - - -class LoadedSample(BaseModelWithAttributeDescriptionsFromDocstrings): - """Sample format to be used by LoadGen.""" - - messages: list[ChatCompletionMessageParam] - """The messages to be sent for chat completion to the VLM inference endpoint.""" - - response_format: ResponseFormatJSONSchema | None = None - """The response format to be used during guided decoding.""" - - @field_validator("messages", mode="after") - @classmethod - def ensure_content_is_list( - cls, - messages: list[ChatCompletionMessageParam], - ) -> list[ChatCompletionMessageParam]: - """If the content is a `ValidatorIterator`, convert it back to a list. - - This is to workaround a Pydantic bug. See - https://github.com/pydantic/pydantic/issues/9467 for more details. 
- """ - for message in messages: - if ( - "content" in message - and message["content"].__class__.__module__ - == "pydantic_core._pydantic_core" - and message["content"].__class__.__name__ == "ValidatorIterator" - ): - message["content"] = list( - message["content"]) # type: ignore[arg-type] - return messages
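
Reviewer note (not part of the patch): the sketch below illustrates how the new Pydantic settings layer introduced in this change is expected to be driven end to end, i.e., build `Settings`, optionally let a `user.conf` override the test settings, and convert everything to the native LoadGen structs via `to_lgtype()`. The import path `mlperf_inf_mm_q3vl.schema` is an assumption based on the renamed package layout, and the field values are illustrative only.

```python
# Illustrative sketch only; assumes the schema module is importable as
# mlperf_inf_mm_q3vl.schema after the vl2l -> qwen3-vl rename.
from mlperf_inf_mm_q3vl.schema import (
    LogSettings,
    Settings,
    TestMode,
    TestScenario,
    TestSettings,
    UserConf,
)

settings = Settings(
    test=TestSettings(
        scenario=TestScenario.OFFLINE,
        mode=TestMode.PERFORMANCE_ONLY,
        min_duration="PT10M",     # ISO 8601 string, handled by parse_timedelta
        server_ttft_latency=0.1,  # plain seconds are accepted as well
    ),
    # When a real path is given, the override_test_settings_from_user_conf
    # validator calls TestSettings.FromConfig to apply user.conf/mlperf.conf.
    user_conf=UserConf(path=None, model="qwen3-vl-235b-a22b"),
    logging=LogSettings(),
)

# Convert to the native LoadGen structs that the benchmark hands to LoadGen
# (typically via lg.StartTestWithLogSettings alongside the SUT and QSL).
lg_test_settings, lg_log_settings = settings.to_lgtype()
print(lg_test_settings.offline_expected_qps, lg_test_settings.min_duration_ms)
```

When a `user.conf` path is provided, its values take precedence over the programmatic ones above, because the validator reloads the settings through LoadGen's `FromConfig` before the test starts.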