diff --git a/src/fed_rag/base/evals/benchmark.py b/src/fed_rag/base/evals/benchmark.py
index 9cab11a9..3793fbee 100644
--- a/src/fed_rag/base/evals/benchmark.py
+++ b/src/fed_rag/base/evals/benchmark.py
@@ -10,25 +10,59 @@ class BaseBenchmark(BaseModel, ABC):
-    """Base Benchmark."""
+    """Base class for implementing benchmarks.
+
+    This abstract class defines the interface for benchmark datasets,
+    providing methods to access examples, iterate over them, and stream
+    them lazily. Subclasses must implement how examples are retrieved
+    and how many examples exist.
+    """
 
     _examples: Sequence[BenchmarkExample] = PrivateAttr()
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    # give it a sequence interface for accessing examples more easily
     def __getitem__(self, index: int) -> BenchmarkExample:
+        """Return a benchmark example at the specified index.
+
+        Args:
+            index (int): The position of the example in the sequence.
+
+        Returns:
+            BenchmarkExample: The benchmark example at the given index.
+        """
         return self._examples.__getitem__(index)
 
     def __len__(self) -> int:
+        """Return the number of loaded examples.
+
+        Returns:
+            int: The number of examples currently loaded into memory.
+        """
         return self._examples.__len__()
 
-    # shouldn't override Pydantic BaseModels' __iter__
     def as_iterator(self) -> Iterator[BenchmarkExample]:
+        """Return an iterator over the loaded examples.
+
+        Note:
+            This uses the in-memory examples. For large datasets that
+            cannot fit into memory, use :meth:`as_stream` instead.
+
+        Returns:
+            Iterator[BenchmarkExample]: Iterator over benchmark examples.
+        """
         return self._examples.__iter__()
 
     @model_validator(mode="after")
     def set_examples(self) -> "BaseBenchmark":
+        """Populate the benchmark with examples after initialization.
+
+        Returns:
+            BaseBenchmark: The instance with examples set.
+
+        Raises:
+            BenchmarkGetExamplesError: If retrieving or parsing examples fails.
+        """
         try:
             self._examples = self._get_examples()
         except BenchmarkParseError as e:
@@ -36,25 +70,48 @@ def set_examples(self) -> "BaseBenchmark":
                 f"Failed to parse examples: {str(e)}"
             ) from e
         except Exception as e:
-            raise (
-                BenchmarkGetExamplesError(f"Failed to get examples: {str(e)}")
+            raise BenchmarkGetExamplesError(
+                f"Failed to get examples: {str(e)}"
             ) from e
         return self
 
-    # abstractmethods
     @abstractmethod
     def _get_examples(self, **kwargs: Any) -> Sequence[BenchmarkExample]:
-        """Method to get examples."""
+        """Fetch and return all benchmark examples.
+
+        Args:
+            **kwargs (Any): Optional arguments for retrieving examples.
+
+        Returns:
+            Sequence[BenchmarkExample]: A sequence of benchmark examples.
+
+        Raises:
+            BenchmarkParseError: If parsing examples fails.
+        """
+        ...
 
     @abstractmethod
     def as_stream(self) -> Generator[BenchmarkExample, None, None]:
-        """Produce a stream of `BenchmarkExamples`."""
+        """Stream benchmark examples one by one.
+
+        This method is useful for very large datasets that cannot be
+        stored entirely in memory.
+
+        Yields:
+            Generator[BenchmarkExample, None, None]: Benchmark examples.
+        """
+        ...
 
     @property
     @abstractmethod
     def num_examples(self) -> int:
-        """Number of examples in the benchmark.
+        """Return the total number of examples in the benchmark.
+
+        Note:
+            If streaming is used, `_examples` may be an empty list. In such
+            cases, subclasses should implement their own logic for counting.
 
-        NOTE: if streaming, `_examples` is likely set to an empty list. Thus,
-        we leave this implementation for the subclasses.
+        Returns:
+            int: Total number of examples.
         """
+        ...
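
Illustration (not part of this diff): a minimal sketch of a concrete subclass satisfying the BaseBenchmark interface. The class name ToyQABenchmark, the inline _QA_PAIRS data, the BenchmarkExample field names (query, response), and the BenchmarkExample import path are assumptions made for the example.

from typing import Any, Generator, Sequence

from fed_rag.base.evals.benchmark import BaseBenchmark
from fed_rag.data_structures.evals import BenchmarkExample  # import path assumed

# Hypothetical toy data; field names (query, response) are assumed for illustration.
_QA_PAIRS = [
    {"query": "What is 2 + 2?", "response": "4"},
    {"query": "What is the capital of France?", "response": "Paris"},
]


class ToyQABenchmark(BaseBenchmark):
    """Tiny in-memory benchmark used only to illustrate the abstract interface."""

    def _get_examples(self, **kwargs: Any) -> Sequence[BenchmarkExample]:
        # Loaded eagerly; the set_examples validator stores the result in _examples.
        return [BenchmarkExample(**pair) for pair in _QA_PAIRS]

    def as_stream(self) -> Generator[BenchmarkExample, None, None]:
        # For a toy dataset, streaming just yields the same examples one at a time.
        yield from self._get_examples()

    @property
    def num_examples(self) -> int:
        return len(_QA_PAIRS)


# Usage: bench = ToyQABenchmark(); bench[0]; len(bench); list(bench.as_iterator())
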
diff --git a/src/fed_rag/base/evals/metric.py b/src/fed_rag/base/evals/metric.py
index 2527b56d..d1cd965f 100644
--- a/src/fed_rag/base/evals/metric.py
+++ b/src/fed_rag/base/evals/metric.py
@@ -7,10 +7,33 @@ class BaseEvaluationMetric(BaseModel, ABC):
-    """Base Data Collator."""
+    """Base class for evaluation metrics.
+
+    This abstract class defines the interface for evaluation metrics that
+    compare a model's prediction against the expected ground truth. Subclasses
+    must implement the :meth:`__call__` method, which makes instances of the
+    metric callable like a function.
+
+    """
 
     @abstractmethod
     def __call__(
         self, prediction: str, actual: str, *args: Any, **kwargs: Any
     ) -> float:
-        """Evaluate an example prediction against the actual response."""
+        """Evaluate a prediction against the actual response.
+
+        Args:
+            prediction (str): The model's predicted output.
+            actual (str): The ground-truth or expected output.
+            *args (Any): Optional positional arguments for customization.
+            **kwargs (Any): Optional keyword arguments for customization.
+
+        Returns:
+            float: A numerical score representing how well the prediction
+                matches the actual output. The interpretation of the score
+                depends on the specific metric implementation.
+
+        Raises:
+            NotImplementedError: If the subclass does not implement this method.
+        """
+        ...
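
Illustration (not part of this diff): a minimal sketch of a metric implementing this interface. The ExactMatchMetric name and its normalization/scoring rule are assumptions made for the example; only the BaseEvaluationMetric interface comes from the diff.

from typing import Any

from fed_rag.base.evals.metric import BaseEvaluationMetric


class ExactMatchMetric(BaseEvaluationMetric):
    """Returns 1.0 when prediction and actual match after normalization, else 0.0."""

    def __call__(
        self, prediction: str, actual: str, *args: Any, **kwargs: Any
    ) -> float:
        # Case- and whitespace-insensitive comparison; higher is better.
        return float(prediction.strip().lower() == actual.strip().lower())


# Instances are callable like functions:
# ExactMatchMetric()(prediction="Paris", actual="paris")  -> 1.0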