Commit 760bde1

add docstring

1 parent 642e1d6

File tree

12 files changed: +1061 / -6 lines changed

recommendation/dlrm_v3/accuracy.py

Lines changed: 7 additions & 0 deletions

@@ -42,6 +42,13 @@ def get_args() -> argparse.Namespace:


 def main() -> None:
+    """
+    Main function to calculate accuracy metrics from loadgen output.
+
+    Reads the mlperf_log_accuracy.json file, parses the results, and computes
+    accuracy metrics using the MetricsLogger. Each result entry contains
+    predictions, labels, and weights packed as float32 numpy arrays.
+    """
     args = get_args()
     logger.warning("Parsing loadgen accuracy log...")
     with open(args.path, "r") as f:
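For context, a loadgen accuracy log is a JSON list of entries whose "data" field is a hex-encoded byte string. Below is a minimal sketch of how one such entry could be unpacked, assuming (this layout is not shown in the diff) that predictions, labels, and weights are three equal-length float32 arrays concatenated back to back:

import json

import numpy as np

with open("mlperf_log_accuracy.json", "r") as f:
    results = json.load(f)

for entry in results:
    # "data" is a hex string; decode it into a flat float32 array.
    packed = np.frombuffer(bytes.fromhex(entry["data"]), dtype=np.float32)
    # Assumed layout: predictions | labels | weights, equal lengths.
    n = len(packed) // 3
    predictions, labels, weights = packed[:n], packed[n : 2 * n], packed[2 * n :]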

recommendation/dlrm_v3/checkpoint.py

Lines changed: 56 additions & 0 deletions

@@ -13,6 +13,12 @@
 # limitations under the License.

 # pyre-strict
+"""
+Checkpoint utilities for saving and loading DLRMv3 model checkpoints.
+
+This module provides functions for saving and loading distributed model checkpoints,
+including both sparse (embedding) and dense (non-embedding) components.
+"""

 import gc
 import os
@@ -29,6 +35,17 @@


 class SparseState(Stateful):
+    """
+    Stateful wrapper for sparse (embedding) tensors in a model.
+
+    This class implements the Stateful interface for distributed checkpointing,
+    allowing sparse tensors to be saved and loaded separately from dense tensors.
+
+    Args:
+        model: The PyTorch model containing sparse tensors.
+        sparse_tensor_keys: Set of keys identifying sparse tensors in the model's state dict.
+    """
+
     def __init__(self, model: torch.nn.Module, sparse_tensor_keys: Set[str]) -> None:
         self.model = model
         self.sparse_tensor_keys = sparse_tensor_keys
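For readers unfamiliar with the Stateful protocol from torch.distributed.checkpoint, a wrapper like SparseState typically only needs to expose state_dict/load_state_dict filtered to the sparse keys. The method bodies below are an illustrative sketch, not the code from this commit:

from typing import Any, Dict, Set

import torch
from torch.distributed.checkpoint.stateful import Stateful


class SparseStateSketch(Stateful):
    def __init__(self, model: torch.nn.Module, sparse_tensor_keys: Set[str]) -> None:
        self.model = model
        self.sparse_tensor_keys = sparse_tensor_keys

    def state_dict(self) -> Dict[str, Any]:
        # Keep only the tensors identified as sparse/embedding state.
        full = self.model.state_dict()
        return {k: v for k, v in full.items() if k in self.sparse_tensor_keys}

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        # strict=False: dense keys are restored by the non-sparse loader.
        self.model.load_state_dict(state_dict, strict=False)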
@@ -79,6 +96,20 @@ def save_dmp_checkpoint(
     batch_idx: int,
     path: str = "",
 ) -> None:
+    """
+    Save a distributed model checkpoint including sparse and dense components.
+
+    Saves the model's sparse tensors using distributed checkpointing; dense
+    tensors, optimizer state, and metrics use standard PyTorch serialization.
+
+    Args:
+        model: The model to checkpoint.
+        optimizer: The optimizer whose state should be saved.
+        metric_logger: The metrics logger containing training/eval metrics.
+        rank: The current process rank in distributed training.
+        batch_idx: The current batch index (used for checkpoint naming).
+        path: Base path for saving the checkpoint. If empty, no checkpoint is saved.
+    """
     if path == "":
         return
     now = datetime.now()
@@ -161,6 +192,18 @@ def load_nonsparse_checkpoint(
     metric_logger: Optional[MetricsLogger] = None,
     path: str = "",
 ) -> None:
+    """
+    Load non-sparse (dense) components from a checkpoint.
+
+    Loads dense model parameters, and optionally optimizer state and metrics.
+
+    Args:
+        model: The model to load dense parameters into.
+        device: The device to load tensors onto.
+        optimizer: Optional optimizer to restore state for.
+        metric_logger: Optional metrics logger to restore state for.
+        path: Base path of the checkpoint. If empty, no loading is performed.
+    """
     if path == "":
         return
     non_sparse_ckpt = f"{path}/non_sparse.ckpt"
@@ -193,6 +236,19 @@ def load_dmp_checkpoint(
     device: torch.device,
     path: str = "",
 ) -> None:
+    """
+    Load a complete distributed model checkpoint (both sparse and dense components).
+
+    This is a convenience function that calls both load_sparse_checkpoint and
+    load_nonsparse_checkpoint.
+
+    Args:
+        model: The model to load the checkpoint into.
+        optimizer: The optimizer to restore state for.
+        metric_logger: The metrics logger to restore state for.
+        device: The device to load tensors onto.
+        path: Base path of the checkpoint. If empty, no loading is performed.
+    """
     load_sparse_checkpoint(model=model, path=path)
     load_nonsparse_checkpoint(
         model=model,
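Taken together, the docstrings imply a save/load round trip along these lines (a hypothetical usage sketch; model, optimizer, metric_logger, rank, and the path are placeholders):

# Save at a given batch index; an empty path makes both calls no-ops.
save_dmp_checkpoint(
    model=model,
    optimizer=optimizer,
    metric_logger=metric_logger,
    rank=rank,
    batch_idx=1000,
    path="/tmp/dlrm_v3_ckpt",
)

# Restore both sparse and dense components in one call.
load_dmp_checkpoint(
    model=model,
    optimizer=optimizer,
    metric_logger=metric_logger,
    device=torch.device("cuda"),
    path="/tmp/dlrm_v3_ckpt",
)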

recommendation/dlrm_v3/configs.py

Lines changed: 30 additions & 0 deletions

@@ -13,6 +13,11 @@
 # limitations under the License.

 # pyre-strict
+"""
+Configuration module for the DLRMv3 model.
+
+This module provides configuration helpers for the HSTU model architecture and its embedding tables.
+"""
 from typing import Dict

 from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
@@ -27,6 +32,19 @@


 def get_hstu_configs(dataset: str = "debug") -> DlrmHSTUConfig:
+    """
+    Create and return the HSTU model configuration.
+
+    Builds a complete DlrmHSTUConfig with default hyperparameters for the HSTU
+    architecture including attention settings, embedding dimensions, dropout rates,
+    and feature name mappings.
+
+    Args:
+        dataset: Dataset identifier (currently unused, reserved for dataset-specific configs).
+
+    Returns:
+        DlrmHSTUConfig: Complete configuration object for the HSTU model.
+    """
     hstu_config = DlrmHSTUConfig(
         hstu_num_heads=4,
         hstu_attn_linear_dim=128,
@@ -97,6 +115,18 @@ def get_hstu_configs(dataset: str = "debug") -> DlrmHSTUConfig:


 def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingConfig]:
+    """
+    Create and return embedding table configurations.
+
+    Defines the embedding table configurations for item IDs, category IDs, and user IDs
+    with their respective dimensions and data types.
+
+    Args:
+        dataset: Dataset identifier (currently unused, reserved for dataset-specific configs).
+
+    Returns:
+        Dict mapping table names to their EmbeddingConfig objects.
+    """
     return {
         "item_id": EmbeddingConfig(
             num_embeddings=HASH_SIZE,
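A brief usage sketch of the two helpers above (the import path is assumed from the file location in this diff; the attribute names come from torchrec's EmbeddingConfig):

from recommendation.dlrm_v3.configs import (
    get_embedding_table_config,
    get_hstu_configs,
)

hstu_config = get_hstu_configs(dataset="debug")
table_configs = get_embedding_table_config(dataset="debug")

# Each entry maps a table name to a torchrec EmbeddingConfig.
for name, cfg in table_configs.items():
    print(name, cfg.num_embeddings, cfg.embedding_dim)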

recommendation/dlrm_v3/data_producer.py

Lines changed: 71 additions & 1 deletion

@@ -13,6 +13,12 @@
 # limitations under the License.

 # pyre-strict
+"""
+Data producer module for DLRMv3 inference.
+
+This module provides classes for producing and managing query data during inference,
+supporting both single-threaded and multi-threaded data production modes.
+"""

 import logging
 import threading
@@ -28,7 +34,16 @@


 class QueryItem:
-    """An item that we queue for processing by the thread pool."""
+    """
+    Container for a query item to be processed by the inference thread pool.
+
+    Attributes:
+        query_ids: List of unique identifiers for the queries in this batch.
+        samples: The sample data containing features for the queries.
+        start: Time when the query was first received.
+        dt_queue: Time spent in the queue before processing.
+        dt_batching: Time spent on batching the data.
+    """

     def __init__(
         self,
@@ -46,13 +61,33 @@ def __init__(


 class SingleThreadDataProducer:
+    """
+    Single-threaded data producer for synchronous query processing.
+
+    This producer processes queries on the main thread without any parallelism,
+    suitable for debugging or low-throughput scenarios.
+
+    Args:
+        ds: The dataset to fetch samples from.
+        run_one_item: Callback function to process a single QueryItem.
+    """
+
     def __init__(self, ds: Dataset, run_one_item) -> None:  # pyre-ignore [2]
         self.ds = ds
         self.run_one_item = run_one_item  # pyre-ignore [4]

     def enqueue(
         self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
     ) -> None:
+        """
+        Enqueue queries for immediate synchronous processing.
+
+        Args:
+            query_ids: List of unique query identifiers.
+            content_ids: List of content/sample identifiers to fetch.
+            t0: Timestamp when the query batch was created.
+            dt_queue: Time spent waiting in the queue.
+        """
         with torch.profiler.record_function("data batching"):
             t0_batching: float = time.time()
             samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids)
@@ -81,10 +116,23 @@ def enqueue(
             self.run_one_item(query)

     def finish(self) -> None:
+        """Finalize the producer. No-op for single-threaded mode."""
         pass


 class MultiThreadDataProducer:
+    """
+    Multi-threaded data producer for parallel query processing.
+
+    Uses a thread pool to fetch and batch data in parallel with model inference,
+    improving throughput for high-load scenarios.
+
+    Args:
+        ds: The dataset to fetch samples from.
+        threads: Number of worker threads to use.
+        run_one_item: Callback function to process a single QueryItem.
+    """
+
     def __init__(
         self,
         ds: Dataset,
@@ -108,6 +156,14 @@ def __init__(
     def handle_tasks(
         self, tasks_queue: Queue[Optional[Tuple[List[int], List[int], float, float]]]
     ) -> None:
+        """
+        Worker thread main loop to process tasks from the queue.
+
+        Each worker maintains its own CUDA stream for parallel execution.
+
+        Args:
+            tasks_queue: Queue containing task tuples or None for termination.
+        """
         stream = torch.cuda.Stream()
         while True:
             query_and_content_ids = tasks_queue.get()
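The per-worker CUDA stream mentioned in the docstring lets each thread's batching and host-to-device copies overlap with inference on the default stream. A self-contained sketch of that pattern (not the commit's code; assumes a CUDA build):

import torch

def worker_loop() -> None:
    stream = torch.cuda.Stream()  # one private stream per worker thread
    with torch.cuda.stream(stream):
        # Stand-in for batching/copy work issued on the worker's stream.
        batch = torch.randn(1024, device="cuda")
    # Make the consumer stream wait before it reads `batch`.
    torch.cuda.current_stream().wait_stream(stream)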
@@ -147,10 +203,24 @@ def handle_tasks(
     def enqueue(
         self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
     ) -> None:
+        """
+        Enqueue queries for asynchronous processing by worker threads.
+
+        Args:
+            query_ids: List of unique query identifiers.
+            content_ids: List of content/sample identifiers to fetch.
+            t0: Timestamp when the query batch was created.
+            dt_queue: Time spent waiting in the queue.
+        """
         with torch.profiler.record_function("data batching"):
             self.tasks.put((query_ids, content_ids, t0, dt_queue))

     def finish(self) -> None:
+        """
+        Signal all worker threads to terminate and wait for completion.
+
+        Sends None to each worker to trigger graceful shutdown.
+        """
         for _ in self.workers:
             self.tasks.put(None)
         for worker in self.workers:
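The shutdown protocol described in finish() is the standard sentinel pattern: one None per worker, then join. A self-contained sketch:

import threading
from queue import Queue
from typing import Optional

tasks: "Queue[Optional[int]]" = Queue()

def worker() -> None:
    while True:
        item = tasks.get()
        if item is None:  # sentinel: exit gracefully
            break
        print("processing", item)

workers = [threading.Thread(target=worker) for _ in range(2)]
for w in workers:
    w.start()
for i in range(4):
    tasks.put(i)
for _ in workers:  # one sentinel per worker
    tasks.put(None)
for w in workers:
    w.join()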
