diff --git a/src/fed_rag/base/evals/benchmark.py b/src/fed_rag/base/evals/benchmark.py
index 9cab11a9..3793fbee 100644
--- a/src/fed_rag/base/evals/benchmark.py
+++ b/src/fed_rag/base/evals/benchmark.py
@@ -10,25 +10,59 @@ class BaseBenchmark(BaseModel, ABC):
-    """Base Benchmark."""
+    """Base class for implementing benchmarks.
+
+    This abstract class defines the interface for benchmark datasets,
+    providing methods to access examples, iterate over them, and stream
+    them lazily. Subclasses must implement how examples are retrieved
+    and how many examples exist.
+    """
 
     _examples: Sequence[BenchmarkExample] = PrivateAttr()
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    # give it a sequence interface for accessing examples more easily
     def __getitem__(self, index: int) -> BenchmarkExample:
+        """Return a benchmark example at the specified index.
+
+        Args:
+            index (int): The position of the example in the sequence.
+
+        Returns:
+            BenchmarkExample: The benchmark example at the given index.
+        """
         return self._examples.__getitem__(index)
 
     def __len__(self) -> int:
+        """Return the number of loaded examples.
+
+        Returns:
+            int: The number of examples currently loaded into memory.
+        """
         return self._examples.__len__()
 
-    # shouldn't override Pydantic BaseModels' __iter__
     def as_iterator(self) -> Iterator[BenchmarkExample]:
+        """Return an iterator over the loaded examples.
+
+        Note:
+            This uses the in-memory examples. For large datasets that
+            cannot fit into memory, use :meth:`as_stream` instead.
+
+        Returns:
+            Iterator[BenchmarkExample]: Iterator over benchmark examples.
+        """
         return self._examples.__iter__()
 
     @model_validator(mode="after")
     def set_examples(self) -> "BaseBenchmark":
+        """Populate the benchmark with examples after initialization.
+
+        Returns:
+            BaseBenchmark: The instance with examples set.
+
+        Raises:
+            BenchmarkGetExamplesError: If retrieving or parsing examples fails.
+        """
         try:
             self._examples = self._get_examples()
         except BenchmarkParseError as e:
@@ -36,25 +70,48 @@ def set_examples(self) -> "BaseBenchmark":
                 f"Failed to parse examples: {str(e)}"
             ) from e
         except Exception as e:
-            raise (
-                BenchmarkGetExamplesError(f"Failed to get examples: {str(e)}")
+            raise BenchmarkGetExamplesError(
+                f"Failed to get examples: {str(e)}"
             ) from e
         return self
 
-    # abstractmethods
     @abstractmethod
     def _get_examples(self, **kwargs: Any) -> Sequence[BenchmarkExample]:
-        """Method to get examples."""
+        """Fetch and return all benchmark examples.
+
+        Args:
+            **kwargs (Any): Optional arguments for retrieving examples.
+
+        Returns:
+            Sequence[BenchmarkExample]: A sequence of benchmark examples.
+
+        Raises:
+            BenchmarkParseError: If parsing examples fails.
+        """
+        ...
 
     @abstractmethod
     def as_stream(self) -> Generator[BenchmarkExample, None, None]:
-        """Produce a stream of `BenchmarkExamples`."""
+        """Stream benchmark examples one by one.
+
+        This method is useful for very large datasets that cannot be
+        stored entirely in memory.
+
+        Yields:
+            Generator[BenchmarkExample, None, None]: Benchmark examples.
+        """
+        ...
 
     @property
     @abstractmethod
     def num_examples(self) -> int:
-        """Number of examples in the benchmark.
+        """Return the total number of examples in the benchmark.
+
+        Note:
+            If streaming is used, `_examples` may be an empty list. In such
+            cases, subclasses should implement their own logic for counting.
 
-        NOTE: if streaming, `_examples` is likely set to an empty list. Thus,
-        we leave this implementation for the subclasses.
+        Returns:
+            int: Total number of examples.
         """
+        ...
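
Illustration (not part of this diff): a minimal sketch of a concrete subclass satisfying the BaseBenchmark interface. The class name ToyQABenchmark, the inline _QA_PAIRS data, the BenchmarkExample field names (query, response), and the BenchmarkExample import path are assumptions made for the example.

from typing import Any, Generator, Sequence

from fed_rag.base.evals.benchmark import BaseBenchmark
from fed_rag.data_structures.evals import BenchmarkExample  # import path assumed

# Hypothetical toy data; field names (query, response) are assumed for illustration.
_QA_PAIRS = [
    {"query": "What is 2 + 2?", "response": "4"},
    {"query": "What is the capital of France?", "response": "Paris"},
]


class ToyQABenchmark(BaseBenchmark):
    """Tiny in-memory benchmark used only to illustrate the abstract interface."""

    def _get_examples(self, **kwargs: Any) -> Sequence[BenchmarkExample]:
        # Loaded eagerly; the set_examples validator stores the result in _examples.
        return [BenchmarkExample(**pair) for pair in _QA_PAIRS]

    def as_stream(self) -> Generator[BenchmarkExample, None, None]:
        # For a toy dataset, streaming just yields the same examples one at a time.
        yield from self._get_examples()

    @property
    def num_examples(self) -> int:
        return len(_QA_PAIRS)


# Usage: bench = ToyQABenchmark(); bench[0]; len(bench); list(bench.as_iterator())
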
diff --git a/src/fed_rag/base/evals/metric.py b/src/fed_rag/base/evals/metric.py
index 2527b56d..d1cd965f 100644
--- a/src/fed_rag/base/evals/metric.py
+++ b/src/fed_rag/base/evals/metric.py
@@ -7,10 +7,33 @@ class BaseEvaluationMetric(BaseModel, ABC):
-    """Base Data Collator."""
+    """Base class for evaluation metrics.
+
+    This abstract class defines the interface for evaluation metrics that
+    compare a model's prediction against the expected ground truth. Subclasses
+    must implement the :meth:`__call__` method, which makes instances of the
+    metric callable like a function.
+
+    """
 
     @abstractmethod
     def __call__(
         self, prediction: str, actual: str, *args: Any, **kwargs: Any
     ) -> float:
-        """Evaluate an example prediction against the actual response."""
+        """Evaluate a prediction against the actual response.
+
+        Args:
+            prediction (str): The model's predicted output.
+            actual (str): The ground-truth or expected output.
+            *args (Any): Optional positional arguments for customization.
+            **kwargs (Any): Optional keyword arguments for customization.
+
+        Returns:
+            float: A numerical score representing how well the prediction
+                matches the actual output. The interpretation of the score
+                depends on the specific metric implementation.
+
+        Raises:
+            NotImplementedError: If the subclass does not implement this method.
+        """
+        ...
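
Illustration (not part of this diff): a minimal sketch of a metric implementing this interface. The ExactMatchMetric name and its normalization/scoring rule are assumptions made for the example; only the BaseEvaluationMetric interface comes from the diff.

from typing import Any

from fed_rag.base.evals.metric import BaseEvaluationMetric


class ExactMatchMetric(BaseEvaluationMetric):
    """Returns 1.0 when prediction and actual match after normalization, else 0.0."""

    def __call__(
        self, prediction: str, actual: str, *args: Any, **kwargs: Any
    ) -> float:
        # Case- and whitespace-insensitive comparison; higher is better.
        return float(prediction.strip().lower() == actual.strip().lower())


# Instances are callable like functions:
# ExactMatchMetric()(prediction="Paris", actual="paris")  -> 1.0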