mlcommons · LinjianMa · Dec 16, 2025 · Dec 17, 2025 · Dec 18, 2025 · Dec 19, 2025
@@ -77,6 +77,7 @@ retinanet.Server.target_latency = 100
 bert.Server.target_latency = 130
 dlrm.Server.target_latency = 60
 dlrm-v2.Server.target_latency = 60
+dlrm-v3.Server.target_latency = 80
 rnnt.Server.target_latency = 1000
 gptj.Server.target_latency = 20000
 stable-diffusion-xl.Server.target_latency = 20000

@@ -0,0 +1,90 @@
+# MLPerf Inference reference implementation for DLRMv3
+
+## Install dependencies and build loadgen
+
+The reference implementation has been tested on a single host with x86_64 CPUs and 8 NVIDIA H100/B200 GPUs. To install the dependencies, run
+```
+sh setup.sh
+```
+
+## Dataset download
+
+DLRMv3 uses a synthetic dataset specifically designed to match the model and system characteristics of large-scale sequential recommendation (large item set and long average sequence length for each request). To generate the dataset used for both training and inference, run
+```
+python streaming_synthetic_data.py
+```
+The generated dataset is 2TB in size and contains 5 million users interacting with a billion items over 100 timestamps.
+
+Only 1% of the dataset is used in the inference benchmark. The sampled DLRMv3 dataset and trained checkpoint are available at https://inference.mlcommons-storage.org/.
+
+Script to download the sampled dataset used in the inference benchmark:
+```
+bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-dataset.uri
+```
+Script to download the 1TB trained checkpoint:
+```
+bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-checkpoint.uri
+```
+
+## Inference benchmark
+
+```
+WORLD_SIZE=8 python main.py --dataset sampled-streaming-100b
+```
+
+`WORLD_SIZE` is the number of GPUs used in the inference benchmark.
+
+```
+usage: main.py [-h] [--dataset {streaming-100b,sampled-streaming-100b}] [--model-path MODEL_PATH] [--scenario-name {Server,Offline}] [--batchsize BATCHSIZE]
+               [--output-trace OUTPUT_TRACE] [--data-producer-threads DATA_PRODUCER_THREADS] [--compute-eval COMPUTE_EVAL] [--find-peak-performance FIND_PEAK_PERFORMANCE]
+               [--dataset-path-prefix DATASET_PATH_PREFIX] [--warmup-ratio WARMUP_RATIO] [--num-queries NUM_QUERIES] [--target-qps TARGET_QPS] [--numpy-rand-seed NUMPY_RAND_SEED]
+               [--sparse-quant SPARSE_QUANT] [--dataset-percentage DATASET_PERCENTAGE]
+
+options:
+  -h, --help            show this help message and exit
+  --dataset {streaming-100b,sampled-streaming-100b}
+                        name of the dataset
+  --model-path MODEL_PATH
+                        path to the model checkpoint. Example: /home/username/ckpts/streaming_100b/89/
+  --scenario-name {Server,Offline}
+                        inference benchmark scenario
+  --batchsize BATCHSIZE
+                        batch size used in the benchmark
+  --output-trace OUTPUT_TRACE
+                        Whether to output trace
+  --data-producer-threads DATA_PRODUCER_THREADS
+                        Number of threads used in data producer
+  --compute-eval COMPUTE_EVAL
+                        If true, will run AccuracyOnly mode and outputs both predictions and labels for accuracy calculations
+  --find-peak-performance FIND_PEAK_PERFORMANCE
+                        Whether to find peak performance in the benchmark
+  --dataset-path-prefix DATASET_PATH_PREFIX
+                        Prefix to the dataset path. Example: /home/username/
+  --warmup-ratio WARMUP_RATIO
+                        The ratio of the dataset used to warmup SUT
+  --num-queries NUM_QUERIES
+                        Number of queries to run in the benchmark
+  --target-qps TARGET_QPS
+                        Benchmark target QPS. Needs to be tuned for different implementations to balance latency and throughput
+  --numpy-rand-seed NUMPY_RAND_SEED
+                        Numpy random seed
+  --sparse-quant SPARSE_QUANT
+                        Whether to quantize sparse arch
+  --dataset-percentage DATASET_PERCENTAGE
+                        Percentage of the dataset to run in the benchmark
+```
+
+## Accuracy test
+
+Setting `run.compute_eval` runs the accuracy test and dumps the prediction outputs to
+`mlperf_log_accuracy.json`. To check the accuracy, run
+
+```
+python accuracy.py --path path/to/mlperf_log_accuracy.json
+```
+We use normalized entropy (NE), accuracy, and AUC as the metrics to evaluate the model quality. For accepted submissions, all three metrics (NE, Accuracy, AUC) must be within 99% of the reference implementation values. The metrics of the reference implementation, evaluated on 34,996 requests across 10 inference timestamps, are listed below:
+```
+NE: 86.687%
+Accuracy: 69.651%
+AUC: 78.663%
+```
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Tool to calculate accuracy for loadgen accuracy output found in mlperf_log_accuracy.json
+"""
+
+import argparse
+import json
+import logging
+
+import numpy as np
+import torch
+from configs import get_hstu_configs
+from utils import MetricsLogger
+
+# Module-level logger, registered under the name "main".
+logger: logging.Logger = logging.getLogger("main")
+
+
+def get_args() -> argparse.Namespace:
+    """Parse commandline."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path",
+        required=True,
+        help="path to mlperf_log_accuracy.json",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main() -> None:
+    """
+    Main function to calculate accuracy metrics from loadgen output.
+
+    Reads the mlperf_log_accuracy.json file, parses the results, and computes
+    accuracy metrics using the MetricsLogger. Each result entry contains
+    predictions, labels, and weights packed as float32 numpy arrays.
+    """
+    args = get_args()
+    logger.warning("Parsing loadgen accuracy log...")
+    with open(args.path, "r") as f:
+        results = json.load(f)
+    hstu_config = get_hstu_configs(dataset="sampled-streaming-100b")
+    metrics = MetricsLogger(
+        multitask_configs=hstu_config.multitask_configs,
+        batch_size=1,
+        window_size=3000,
+        device=torch.device("cpu"),
+        rank=0,
+    )
+    logger.warning(f"results have {len(results)} entries")
+    for result in results:
+        data = np.frombuffer(bytes.fromhex(result["data"]), np.float32)
+        num_candidates = data[-1].astype(int)
+        assert len(data) == 1 + num_candidates * 3
+        mt_target_preds = torch.from_numpy(data[0:num_candidates])
+        mt_target_labels = torch.from_numpy(data[num_candidates : num_candidates * 2])
+        mt_target_weights = torch.from_numpy(
+            data[num_candidates * 2 : num_candidates * 3]
+        )
+        num_candidates = torch.tensor([num_candidates])
+        metrics.update(
+            predictions=mt_target_preds.view(1, -1),
+            labels=mt_target_labels.view(1, -1),
+            weights=mt_target_weights.view(1, -1),
+            num_candidates=num_candidates,
+        )
+    for k, v in metrics.compute().items():
+        logger.warning(f"{k}: {v}")
+
+
+# Run the accuracy calculation only when this file is executed as a script.
+if __name__ == "__main__":
+    main()