Merged
Commits
18 commits
8deea72
feat: Add pre-optimization summary display (Issue #19)
siddhantparadox Jun 27, 2025
fad488c
feat: validate minimum records in dataset file
Sangamesh26 Jun 28, 2025
85c4c97
chore: lint code
Sangamesh26 Jun 28, 2025
cb6bff5
feat: Add pre-optimization summary and fix CI failures
siddhantparadox Jul 2, 2025
1620a37
Merge pull request #28 from Sangamesh26/feature/validation-min-record…
heyjustinai Jul 2, 2025
0a7abf6
reimplemented baseline scoring and use test set
heyjustinai Jul 3, 2025
c9032e6
feat: Add pre-optimization summary display (Issue #19)
siddhantparadox Jun 27, 2025
ac3b647
feat: Add pre-optimization summary and fix CI failures
siddhantparadox Jul 2, 2025
4a43302
reimplemented baseline scoring and use test set
heyjustinai Jul 3, 2025
61ab1a4
Merge branch 'feature/pre-optimization-summary' of https://github.com…
heyjustinai Jul 3, 2025
e318171
temp file fix in test datasets
heyjustinai Jul 3, 2025
7b04df4
enhance task and prompt model names for strategy configurations
heyjustinai Jul 3, 2025
dda5d15
refactor pre-optimization summary handling in prompt_strategy
heyjustinai Jul 3, 2025
c54b8c0
update baseline computation settings and enhance strategy parameter e…
heyjustinai Jul 3, 2025
7e1a799
remove pre opt summary duplication
heyjustinai Jul 3, 2025
f0a81de
refactor logging configuration in CLI to allow dynamic log level adju…
heyjustinai Jul 3, 2025
7bb6487
refactor pre-optimization summary test to validate summary creation a…
heyjustinai Jul 3, 2025
f86e849
fix: resolve CI test failures by fixing atexit handler and removing l…
heyjustinai Jul 3, 2025
2 changes: 2 additions & 0 deletions .gitignore
@@ -111,3 +111,5 @@ use-cases/joule/
use-cases/dox/
datasets/dropbox_qa/dropbox_data.json
publish.sh

scripts/
44 changes: 44 additions & 0 deletions docs/advanced/logging.md
@@ -43,3 +43,47 @@ The exported JSON file will contain two main sections:

- `timings`: The duration of each major phase of the optimization process.
- `metrics`: A list of metrics logged during the process, including the metric key, value, and step.

## Pre-Optimization Summary

Before starting the optimization process, `llama-prompt-ops` displays a comprehensive summary of the optimization configuration. This summary provides transparency into what will happen during optimization and helps with debugging and reproducibility.

### Summary Contents

The pre-optimization summary includes:

- **Task Model**: The model used for executing the task
- **Proposer Model**: The model used for generating instruction proposals (MIPROv2)
- **Metric**: The evaluation metric being used
- **Train / Val size**: Number of training and validation examples
- **MIPRO Params**: Key MIPROv2 optimization parameters
- **Guidance**: Any custom instruction tips provided to the proposer
- **Baseline score**: Performance of the original prompt before optimization

### Example Output

```
=== Pre-Optimization Summary ===
Task Model : openai/gpt-4o-mini
Proposer Model : openai/gpt-4o
Metric : facility_metric
Train / Val size : 100 / 25
MIPRO Params : {"auto_user":"basic","auto_dspy":"light","max_labeled_demos":5,"max_bootstrapped_demos":4,"num_candidates":10,"num_threads":18,"init_temperature":0.5,"seed":9}
Guidance : Use chain-of-thought reasoning and show your work step by step...
Baseline score : 0.4200
```

### Controlling Visibility

The pre-optimization summary is logged at `INFO` level. To see it, ensure your log level is set to `INFO` or lower:

```bash
# Via command line
llama-prompt-ops migrate --config config.yaml --log-level INFO

# Via environment variable
export PROMPT_OPS_LOG_LEVEL=INFO
llama-prompt-ops migrate --config config.yaml
```

The summary provides valuable context for understanding optimization results and can help identify configuration issues before the optimization process begins.
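
For completeness, a minimal sketch (not part of this diff) of setting the documented `PROMPT_OPS_LOG_LEVEL` variable from Python before invoking the CLI; the subprocess call itself is illustrative:

```python
# Sketch: enable the INFO-level pre-optimization summary, then run the CLI.
# PROMPT_OPS_LOG_LEVEL is documented above; everything else is illustrative.
import os
import subprocess

os.environ["PROMPT_OPS_LOG_LEVEL"] = "INFO"
subprocess.run(
    ["llama-prompt-ops", "migrate", "--config", "config.yaml"],
    check=True,
)
```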
7 changes: 7 additions & 0 deletions src/llama_prompt_ops/core/migrator.py
@@ -160,11 +160,15 @@ def optimize(
if hasattr(self.strategy, "valset") and valset:
self.strategy.valset = valset

if hasattr(self.strategy, "testset") and testset:
self.strategy.testset = testset

self.logger.progress(
f"Applying {self.strategy.__class__.__name__} to optimize prompt"
)
self.logger.progress(f"Training set size: {len(trainset) if trainset else 0}")
self.logger.progress(f"Validation set size: {len(valset) if valset else 0}")
self.logger.progress(f"Test set size: {len(testset) if testset else 0}")

with self.logger.phase("Running optimization strategy"):
optimized_program = self.strategy.run(prompt_data)
@@ -215,6 +219,9 @@ def load_dataset_with_adapter(
if hasattr(self.strategy, "valset") and self.valset:
self.strategy.valset = self.valset

if hasattr(self.strategy, "testset") and self.testset:
self.strategy.testset = self.testset

return self.trainset, self.valset, self.testset

def evaluate(
7 changes: 7 additions & 0 deletions src/llama_prompt_ops/core/model_strategies.py
@@ -45,6 +45,8 @@ def __init__(
max_bootstrapped_demos: int = 4,
max_labeled_demos: int = 5,
auto: Optional[Literal["basic", "intermediate", "advanced"]] = "basic",
task_model_name: Optional[str] = None,
prompt_model_name: Optional[str] = None,
**kwargs,
):
"""
@@ -60,6 +62,8 @@ def __init__(
max_bootstrapped_demos: Maximum number of bootstrapped demos for MIPROv2
max_labeled_demos: Maximum number of labeled demos for MIPROv2
auto: Auto mode for MIPROv2 (basic, intermediate, advanced)
task_model_name: Name of the task model (for display purposes)
prompt_model_name: Name of the prompt/proposer model (for display purposes)
**kwargs: Additional parameters for BasicOptimizationStrategy
"""
# Verify that the model is a Llama model
@@ -77,6 +81,8 @@ def __init__(
max_bootstrapped_demos=max_bootstrapped_demos,
max_labeled_demos=max_labeled_demos,
auto=auto,
task_model_name=task_model_name,
prompt_model_name=prompt_model_name,
**kwargs,
)

@@ -145,6 +151,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:

# Ensure the base strategy has the latest models and datasets
# This is important because these might be set after initialization
# and the pre-optimization summary needs them
self.base_strategy.task_model = self.task_model
self.base_strategy.prompt_model = self.prompt_model
self.base_strategy.trainset = self.trainset
171 changes: 147 additions & 24 deletions src/llama_prompt_ops/core/prompt_strategies.py
@@ -14,13 +14,15 @@
import logging
import os
import sys
import time
import traceback
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Union

import dspy
from typing_extensions import Literal

from .evaluation import create_evaluator
from .utils import map_auto_mode_to_dspy


@@ -144,6 +146,11 @@ def __init__(
fewshot_aware_proposer: bool = True,
use_llama_tips: bool = True,
requires_permission_to_run: bool = False,
# Baseline computation settings
compute_baseline: bool = True,
# Model name parameters for display
task_model_name: Optional[str] = None,
prompt_model_name: Optional[str] = None,
**kwargs,
):
"""
@@ -179,7 +186,13 @@ def __init__(
tip_aware_proposer: Whether to use tip-aware instruction proposals
fewshot_aware_proposer: Whether to use few-shot aware instruction proposals
requires_permission_to_run: Whether to require user permission to run
provide_traceback: Whether to provide tracebacks for errors

# Baseline computation parameters
compute_baseline: Whether to compute baseline score before optimization

# Model name parameters for display
task_model_name: Name of the task model
prompt_model_name: Name of the prompt model

**kwargs: Additional configuration parameters
"""
@@ -192,6 +205,7 @@ def __init__(
# Training and validation data
self.trainset = kwargs.get("trainset", [])
self.valset = kwargs.get("valset", [])
self.testset = kwargs.get("testset", [])

# Model-specific optimization settings
self.use_llama_tips = use_llama_tips
@@ -221,6 +235,127 @@ def __init__(
self.fewshot_aware_proposer = fewshot_aware_proposer
self.requires_permission_to_run = requires_permission_to_run

# Baseline computation settings
self.compute_baseline = compute_baseline

# Model name parameters for display
self.task_model_name = task_model_name
self.prompt_model_name = prompt_model_name

def _get_model_name(self, model) -> str:
Review comment: we want to make sure to add type safety here, and also not violate the open/closed principle.

"""
Get a human-readable name for a model using stored names.

Args:
model: The model object to get the name for

Returns:
A string representation of the model name
"""
if model is None:
return "None"

# Use stored model names if available
if model is self.task_model and self.task_model_name:
return self.task_model_name
if model is self.prompt_model and self.prompt_model_name:
return self.prompt_model_name

# Fallback to legacy introspection for backward compatibility
if hasattr(model, "model_name"):
return str(model.model_name)
if hasattr(model, "model"):
return str(model.model)
if hasattr(model, "_model") and hasattr(model._model, "model"):
return str(model._model.model)

# Final fallback
return str(model)
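
# --- Hedged sketch (reviewer suggestion above; not part of this PR) ---
# One way to make model-name resolution type-safe and open for extension:
# route it through a list of typed resolver callables that can be extended
# without editing the strategy class. All names below are illustrative.
from typing import Any, Callable, List, Optional

ModelNameResolver = Callable[[Any], Optional[str]]

_NAME_RESOLVERS: List[ModelNameResolver] = [
    lambda m: str(m.model_name) if hasattr(m, "model_name") else None,
    lambda m: str(m.model) if hasattr(m, "model") else None,
]

def resolve_model_name(model: Any, explicit_name: Optional[str] = None) -> str:
    """Prefer an explicitly configured display name, then registered resolvers."""
    if explicit_name:
        return explicit_name
    if model is None:
        return "None"
    for resolver in _NAME_RESOLVERS:
        name = resolver(model)
        if name is not None:
            return name
    return str(model)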

def _create_signature(self, prompt_data: Dict[str, Any], instructions: str):
"""
Create a DSPy signature with explicit field definitions.

Args:
prompt_data: Dictionary containing inputs and outputs field definitions
instructions: The instruction text for the signature

Returns:
DSPy signature class
"""
# Create a signature class dynamically with proper field definitions
input_fields = {}
output_fields = {}

# Define input and output fields based on prompt_data
for field in prompt_data.get("inputs", ["question"]):
input_fields[field] = dspy.InputField(desc="${" + field + "}")
for field in prompt_data.get("outputs", ["answer"]):
output_fields[field] = dspy.OutputField(desc="${" + field + "}")

# Create the signature class with proper field definitions
DynamicSignature = type(
"DynamicSignature",
(dspy.Signature,),
{
**input_fields,
**output_fields,
"__doc__": instructions, # Store the instructions as the docstring
},
)

return DynamicSignature

def _compute_baseline_score(self, prompt_data: Dict[str, Any]) -> Optional[float]:
Review comment: We should use the built-in evaluator class here.

Review comment: We should probably have this separately as well: (i) single responsibility principle, (ii) hard to reuse it across different strategies, (iii) allow testing in isolation.

"""
Compute baseline score using the original prompt before optimization.
Uses testset to avoid data leakage and evaluation.py for consistency.

Args:
prompt_data: Dictionary containing the prompt text and metadata

Returns:
Baseline score as float, or None if computation fails or is not possible
"""
if not self.metric or not self.testset:
logging.debug("Skipping baseline computation: missing metric or test set")
return None

if not self.compute_baseline:
logging.debug("Baseline computation disabled")
return None

try:
start_time = time.time()

# Use consistent signature creation with original prompt
baseline_signature = self._create_signature(
prompt_data, prompt_data["text"]
)
baseline_program = dspy.Predict(baseline_signature)

print(
f"\nComputing baseline score on {len(self.testset)} test examples using {self.num_threads} threads..."
)

evaluator = create_evaluator(
metric=self.metric,
devset=self.testset,
num_threads=self.num_threads, # Use the strategy's num_threads setting
display_progress=True,
display_table=False,
)

score = evaluator.evaluate(baseline_program)
duration = time.time() - start_time

print(f"✅ Baseline Score: {score:.3f} in {duration:.2f}s\n")
return float(score)

except Exception as e:
logging.warning(f"Baseline evaluation failed: {e}")
return None
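
# --- Hedged sketch (reviewer suggestions above; not part of this PR) ---
# Extracting baseline scoring into its own small class would give it a single
# responsibility, let other strategies reuse it, and make it testable in
# isolation, while still delegating to the create_evaluator helper imported
# above. The class name and defaults are illustrative.
class BaselineScorer:
    def __init__(self, metric, testset, num_threads: int = 18):
        self.metric = metric
        self.testset = testset
        self.num_threads = num_threads

    def score(self, program) -> Optional[float]:
        """Evaluate an un-optimized program on the held-out test set."""
        if not self.metric or not self.testset:
            return None
        try:
            evaluator = create_evaluator(
                metric=self.metric,
                devset=self.testset,
                num_threads=self.num_threads,
                display_progress=True,
                display_table=False,
            )
            return float(evaluator.evaluate(program))
        except Exception as exc:  # defensive fallback, mirrors the method above
            logging.warning(f"Baseline evaluation failed: {exc}")
            return None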

def run(self, prompt_data: Dict[str, Any]) -> Any:
"""
Apply basic optimization to the prompt using DSPy's MIPROv2.
@@ -237,6 +372,11 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
if "dspy" not in globals() or not self.trainset:
return f"[Optimized for {self.model_name}] {text}"

# Display pre-optimization summary using utility function
from .utils.summary_utils import create_and_display_summary

create_and_display_summary(self, prompt_data)

try:
# Add model-specific tips to the prompt if enabled
model_tips = None
@@ -269,29 +409,12 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:

# Update the prompt text in prompt_data
prompt_data["text"] = text
# Create a signature class dynamically with proper field definitions
input_fields = {}
output_fields = {}

# Define input and output fields based on prompt_data
for field in prompt_data.get("inputs", ["question"]):
input_fields[field] = dspy.InputField(desc="${" + field + "}")
for field in prompt_data.get("outputs", ["answer"]):
output_fields[field] = dspy.OutputField(desc="${" + field + "}")

# Create the signature class with proper field definitions
DynamicSignature = type(
"DynamicSignature",
(dspy.Signature,),
{
**input_fields,
**output_fields,
"__doc__": text, # Store the instructions as the docstring
},
)

# Create signature using consistent helper method with enhanced prompt
signature = self._create_signature(prompt_data, text)

# Create program instance with the signature
program = dspy.Predict(DynamicSignature)
program = dspy.Predict(signature)

# Map our naming convention to DSPy's expected values
dspy_auto_mode = map_auto_mode_to_dspy(self.auto)
@@ -338,7 +461,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
# Use our custom instruction tips with highest priority
optimizer.proposer_kwargs["tip"] = self.proposer_kwargs["tip"]
logging.info(
f"Using custom instruction tips: {self.proposer_kwargs['tip']}..."
f"Using custom instruction tips: {self.proposer_kwargs['tip'][:50] if self.proposer_kwargs['tip'] else 'None'}"
)
# Otherwise, if we have model-specific tips, use those
elif model_tips:
@@ -355,7 +478,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
)

logging.info(
f"Compiling program with {len(self.trainset)} training examples and {len(self.valset)} validation examples"
f"Compiling program with {len(self.trainset)} training examples, {len(self.valset)} validation examples, and {len(self.testset)} test examples"
)

# Create a custom compile method that injects our tip directly
Expand Down
5 changes: 5 additions & 0 deletions src/llama_prompt_ops/core/utils/__init__.py
@@ -10,10 +10,15 @@
from .format_utils import convert_json_to_yaml, json_to_yaml_file
from .logging import get_logger
from .strategy_utils import map_auto_mode_to_dspy
from .summary_utils import create_and_display_summary, create_pre_optimization_summary
from .telemetry import PreOptimizationSummary

__all__ = [
"map_auto_mode_to_dspy",
"convert_json_to_yaml",
"json_to_yaml_file",
"get_logger",
"PreOptimizationSummary",
"create_pre_optimization_summary",
"create_and_display_summary",
]