Merged

18 commits
- 8deea72 feat: Add pre-optimization summary display (Issue #19) (siddhantparadox, Jun 27, 2025)
- fad488c feat: validate minimum records in dataset file (Sangamesh26, Jun 28, 2025)
- 85c4c97 chore: lint code (Sangamesh26, Jun 28, 2025)
- cb6bff5 feat: Add pre-optimization summary and fix CI failures (siddhantparadox, Jul 2, 2025)
- 1620a37 Merge pull request #28 from Sangamesh26/feature/validation-min-record… (heyjustinai, Jul 2, 2025)
- 0a7abf6 reimplemented baseline scoring and use test set (heyjustinai, Jul 3, 2025)
- c9032e6 feat: Add pre-optimization summary display (Issue #19) (siddhantparadox, Jun 27, 2025)
- ac3b647 feat: Add pre-optimization summary and fix CI failures (siddhantparadox, Jul 2, 2025)
- 4a43302 reimplemented baseline scoring and use test set (heyjustinai, Jul 3, 2025)
- 61ab1a4 Merge branch 'feature/pre-optimization-summary' of https://github.com… (heyjustinai, Jul 3, 2025)
- e318171 temp file fix in test datasets (heyjustinai, Jul 3, 2025)
- 7b04df4 enhance task and prompt model names for strategy configurations (heyjustinai, Jul 3, 2025)
- dda5d15 refactor pre-optimization summary handling in prompt_strategy (heyjustinai, Jul 3, 2025)
- c54b8c0 update baseline computation settings and enhance strategy parameter e… (heyjustinai, Jul 3, 2025)
- 7e1a799 remove pre opt summary duplication (heyjustinai, Jul 3, 2025)
- f0a81de refactor logging configuration in CLI to allow dynamic log level adju… (heyjustinai, Jul 3, 2025)
- 7bb6487 refactor pre-optimization summary test to validate summary creation a… (heyjustinai, Jul 3, 2025)
- f86e849 fix: resolve CI test failures by fixing atexit handler and removing l… (heyjustinai, Jul 3, 2025)
44 changes: 44 additions & 0 deletions docs/advanced/logging.md
@@ -43,3 +43,47 @@ The exported JSON file will contain two main sections:

- `timings`: The duration of each major phase of the optimization process.
- `metrics`: A list of metrics logged during the process, including the metric key, value, and step.

## Pre-Optimization Summary

Before starting the optimization process, `llama-prompt-ops` displays a comprehensive summary of the optimization configuration. This summary provides transparency into what will happen during optimization and helps with debugging and reproducibility.

### Summary Contents

The pre-optimization summary includes:

- **Task Model**: The model used for executing the task
- **Proposer Model**: The model used for generating instruction proposals (MIPROv2)
- **Metric**: The evaluation metric being used
- **Train / Val size**: Number of training and validation examples
- **MIPRO Params**: Key MIPROv2 optimization parameters
- **Guidance**: Any custom instruction tips provided to the proposer
- **Baseline score**: Performance of the original prompt before optimization

### Example Output

```
=== Pre-Optimization Summary ===
Task Model : openai/gpt-4o-mini
Proposer Model : openai/gpt-4o
Metric : facility_metric
Train / Val size : 100 / 25
MIPRO Params : {"auto_user":"basic","auto_dspy":"light","max_labeled_demos":5,"max_bootstrapped_demos":4,"num_candidates":10,"num_threads":18,"init_temperature":0.5,"seed":9}
Guidance : Use chain-of-thought reasoning and show your work step by step...
Baseline score : 0.4200
```

### Controlling Visibility

The pre-optimization summary is logged at `INFO` level. To see it, ensure your log level is set to `INFO` or a more verbose level such as `DEBUG`:

```bash
# Via command line
llama-prompt-ops migrate --config config.yaml --log-level INFO

# Via environment variable
export PROMPT_OPS_LOG_LEVEL=INFO
llama-prompt-ops migrate --config config.yaml
```

The summary provides valuable context for understanding optimization results and can help identify configuration issues before the optimization process begins.
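
For reference, the summary above is produced by the `PreOptimizationSummary` object this PR adds. Below is a minimal, hypothetical sketch of constructing and logging one directly; the field names are taken from this diff, but the values are illustrative:

```python
# Hypothetical sketch: field names come from the PreOptimizationSummary
# constructor added in this PR; the values below are placeholders.
from llama_prompt_ops.core.utils import PreOptimizationSummary

summary = PreOptimizationSummary(
    task_model="openai/gpt-4o-mini",
    proposer_model="openai/gpt-4o",
    metric_name="facility_metric",
    train_size=100,
    val_size=25,
    mipro_params={"auto_user": "basic", "auto_dspy": "light", "num_threads": 18},
    guidance="Use chain-of-thought reasoning and show your work step by step...",
    baseline_score=0.42,
)
summary.log()  # writes the summary at INFO level
```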
7 changes: 7 additions & 0 deletions src/llama_prompt_ops/core/migrator.py
@@ -160,11 +160,15 @@ def optimize(
if hasattr(self.strategy, "valset") and valset:
self.strategy.valset = valset

if hasattr(self.strategy, "testset") and testset:
self.strategy.testset = testset

self.logger.progress(
f"Applying {self.strategy.__class__.__name__} to optimize prompt"
)
self.logger.progress(f"Training set size: {len(trainset) if trainset else 0}")
self.logger.progress(f"Validation set size: {len(valset) if valset else 0}")
self.logger.progress(f"Test set size: {len(testset) if testset else 0}")

with self.logger.phase("Running optimization strategy"):
optimized_program = self.strategy.run(prompt_data)
@@ -215,6 +219,9 @@ def load_dataset_with_adapter(
if hasattr(self.strategy, "valset") and self.valset:
self.strategy.valset = self.valset

if hasattr(self.strategy, "testset") and self.testset:
self.strategy.testset = self.testset

return self.trainset, self.valset, self.testset

def evaluate(
1 change: 1 addition & 0 deletions src/llama_prompt_ops/core/model_strategies.py
@@ -145,6 +145,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:

# Ensure the base strategy has the latest models and datasets
# This is important because these might be set after initialization
# and the pre-optimization summary needs them
self.base_strategy.task_model = self.task_model
self.base_strategy.prompt_model = self.prompt_model
self.base_strategy.trainset = self.trainset
201 changes: 177 additions & 24 deletions src/llama_prompt_ops/core/prompt_strategies.py
@@ -14,14 +14,17 @@
import logging
import os
import sys
import time
import traceback
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Union

import dspy
from typing_extensions import Literal

from .evaluation import create_evaluator
from .utils import map_auto_mode_to_dspy
from .utils.telemetry import PreOptimizationSummary


class OptimizationError(Exception):
@@ -144,6 +147,8 @@ def __init__(
fewshot_aware_proposer: bool = True,
use_llama_tips: bool = True,
requires_permission_to_run: bool = False,
# Baseline computation settings
compute_baseline: bool = False,
**kwargs,
):
"""
Expand Down Expand Up @@ -179,7 +184,9 @@ def __init__(
tip_aware_proposer: Whether to use tip-aware instruction proposals
fewshot_aware_proposer: Whether to use few-shot aware instruction proposals
requires_permission_to_run: Whether to require user permission to run
provide_traceback: Whether to provide tracebacks for errors

# Baseline computation parameters
compute_baseline: Whether to compute baseline score before optimization

**kwargs: Additional configuration parameters
"""
@@ -192,6 +199,7 @@
# Training and validation data
self.trainset = kwargs.get("trainset", [])
self.valset = kwargs.get("valset", [])
self.testset = kwargs.get("testset", [])

# Model-specific optimization settings
self.use_llama_tips = use_llama_tips
Expand Down Expand Up @@ -221,6 +229,120 @@ def __init__(
self.fewshot_aware_proposer = fewshot_aware_proposer
self.requires_permission_to_run = requires_permission_to_run

# Baseline computation settings
self.compute_baseline = compute_baseline

def _get_model_name(self, model) -> str:
Review comment (Member): we want to make sure to add type safety here, and also not violate the open/closed principle

"""
Extract a human-readable name from a model object.

Args:
model: The model object (could be a DSPy model, adapter, or string)

Returns:
A string representation of the model name
"""
if model is None:
return "None"

# Try to get model_name attribute first
if hasattr(model, "model_name"):
return str(model.model_name)

# Try to get model attribute (for adapters)
if hasattr(model, "model"):
return str(model.model)

# For DSPyModelAdapter, try to get the underlying model name
if hasattr(model, "_model") and hasattr(model._model, "model"):
return str(model._model.model)

# Fall back to string representation
return str(model)

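As a hedged illustration of the reviewer's type-safety and open/closed comment above, one possible direction (hypothetical, not part of this PR; `DSPyModelAdapter` appears only in a commented-out placeholder) is a `functools.singledispatch` helper, so new model types register their own name extraction instead of growing this method:

```python
# Hypothetical sketch illustrating the reviewer's suggestion; not part of this PR.
from functools import singledispatch
from typing import Any


@singledispatch
def model_display_name(model: Any) -> str:
    """Fallback: handle None and unknown types via their attributes or string form."""
    if model is None:
        return "None"
    if hasattr(model, "model_name"):
        return str(model.model_name)
    return str(model)


@model_display_name.register
def _(model: str) -> str:
    # Plain strings are already display names.
    return model


# A concrete adapter type would register its own handler, e.g.:
# @model_display_name.register(DSPyModelAdapter)
# def _(model: DSPyModelAdapter) -> str:
#     return str(model._model.model)
```
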
def _create_signature(self, prompt_data: Dict[str, Any], instructions: str):
"""
Create a DSPy signature with explicit field definitions.

Args:
prompt_data: Dictionary containing inputs and outputs field definitions
instructions: The instruction text for the signature

Returns:
DSPy signature class
"""
# Create a signature class dynamically with proper field definitions
input_fields = {}
output_fields = {}

# Define input and output fields based on prompt_data
for field in prompt_data.get("inputs", ["question"]):
input_fields[field] = dspy.InputField(desc="${" + field + "}")
for field in prompt_data.get("outputs", ["answer"]):
output_fields[field] = dspy.OutputField(desc="${" + field + "}")

# Create the signature class with proper field definitions
DynamicSignature = type(
"DynamicSignature",
(dspy.Signature,),
{
**input_fields,
**output_fields,
"__doc__": instructions, # Store the instructions as the docstring
},
)

return DynamicSignature

def _compute_baseline_score(self, prompt_data: Dict[str, Any]) -> Optional[float]:
Review comment (Member): We should use the built-in evaluator class here

Review comment (Member): We should probably have this separately as well: (i) single responsibility principle, (ii) hard to reuse it across different strategies, (iii) allows testing in isolation

"""
Compute baseline score using the original prompt before optimization.
Uses testset to avoid data leakage and evaluation.py for consistency.

Args:
prompt_data: Dictionary containing the prompt text and metadata

Returns:
Baseline score as float, or None if computation fails or is not possible
"""
if not self.metric or not self.testset:
logging.debug("Skipping baseline computation: missing metric or test set")
return None

if not self.compute_baseline:
logging.debug("Baseline computation disabled")
return None

try:
start_time = time.time()
logging.info("Computing baseline score using testset...")

# Use consistent signature creation with original prompt
baseline_signature = self._create_signature(
prompt_data, prompt_data["text"]
)
baseline_program = dspy.Predict(baseline_signature)

# Leverage existing evaluation infrastructure
evaluator = create_evaluator(
metric=self.metric,
devset=self.testset,
display_progress=False,
display_table=False,
)

score = evaluator.evaluate(baseline_program)
duration = time.time() - start_time

logging.info(
f"Baseline evaluation completed in {duration:.2f}s: {score:.3f}"
)
return float(score)

except Exception as e:
logging.warning(f"Baseline evaluation failed: {e}")
return None

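One way to read the single-responsibility comments above is to pull baseline scoring out into a free-standing helper that any strategy can call and that can be unit-tested in isolation. The sketch below is hypothetical (the function name, module placement, and `signature_factory` parameter are assumptions); it reuses `dspy.Predict` and `create_evaluator` exactly as this diff does:

```python
# Hypothetical sketch; not part of this PR.
import logging
import time
from typing import Any, Callable, Dict, List, Optional

import dspy

from llama_prompt_ops.core.evaluation import create_evaluator


def compute_baseline_score(
    signature_factory: Callable[[Dict[str, Any], str], Any],
    prompt_data: Dict[str, Any],
    metric: Any,
    testset: List[Any],
) -> Optional[float]:
    """Score the unoptimized prompt on the held-out test set."""
    if not metric or not testset:
        logging.debug("Skipping baseline computation: missing metric or test set")
        return None
    try:
        start = time.time()
        # Build a predictor over the original (unoptimized) prompt text.
        program = dspy.Predict(signature_factory(prompt_data, prompt_data["text"]))
        evaluator = create_evaluator(
            metric=metric,
            devset=testset,
            display_progress=False,
            display_table=False,
        )
        score = float(evaluator.evaluate(program))
        logging.info(
            f"Baseline evaluation completed in {time.time() - start:.2f}s: {score:.3f}"
        )
        return score
    except Exception as exc:
        logging.warning(f"Baseline evaluation failed: {exc}")
        return None
```
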
def run(self, prompt_data: Dict[str, Any]) -> Any:
"""
Apply basic optimization to the prompt using DSPy's MIPROv2.
Expand All @@ -237,6 +359,54 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
if "dspy" not in globals() or not self.trainset:
return f"[Optimized for {self.model_name}] {text}"

# Display pre-optimization summary
try:
# Collect guidance information
guidance = None
if (
hasattr(self, "proposer_kwargs")
and self.proposer_kwargs
and "tip" in self.proposer_kwargs
):
guidance = self.proposer_kwargs["tip"]

# Compute baseline score if enabled
baseline_score = None
Review comment (Member): Core requirement for baseline score is disabled; we still need to compute the baseline score

if self.compute_baseline:
try:
baseline_score = self._compute_baseline_score(prompt_data)
except Exception as baseline_e:
logging.warning(f"Failed to compute baseline score: {baseline_e}")
baseline_score = None

# Create and display the pre-optimization summary
summary = PreOptimizationSummary(
task_model=self._get_model_name(self.task_model),
proposer_model=self._get_model_name(self.prompt_model),
metric_name=(
getattr(self.metric, "__name__", str(self.metric))
if self.metric
else "None"
),
train_size=len(self.trainset or []),
val_size=len(self.valset or []),
mipro_params={
"auto_user": self.auto,
"auto_dspy": map_auto_mode_to_dspy(self.auto),
"max_labeled_demos": self.max_labeled_demos,
"max_bootstrapped_demos": self.max_bootstrapped_demos,
"num_candidates": self.num_candidates,
"num_threads": self.num_threads,
"init_temperature": self.init_temperature,
"seed": self.seed,
},
guidance=guidance,
baseline_score=baseline_score,
)
summary.log()
except Exception as e:
logging.warning(f"Failed to display pre-optimization summary: {str(e)}")

try:
# Add model-specific tips to the prompt if enabled
model_tips = None
@@ -269,29 +439,12 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:

# Update the prompt text in prompt_data
prompt_data["text"] = text
# Create a signature class dynamically with proper field definitions
input_fields = {}
output_fields = {}

# Define input and output fields based on prompt_data
for field in prompt_data.get("inputs", ["question"]):
input_fields[field] = dspy.InputField(desc="${" + field + "}")
for field in prompt_data.get("outputs", ["answer"]):
output_fields[field] = dspy.OutputField(desc="${" + field + "}")

# Create the signature class with proper field definitions
DynamicSignature = type(
"DynamicSignature",
(dspy.Signature,),
{
**input_fields,
**output_fields,
"__doc__": text, # Store the instructions as the docstring
},
)

# Create signature using consistent helper method with enhanced prompt
signature = self._create_signature(prompt_data, text)

# Create program instance with the signature
program = dspy.Predict(DynamicSignature)
program = dspy.Predict(signature)

# Map our naming convention to DSPy's expected values
dspy_auto_mode = map_auto_mode_to_dspy(self.auto)
@@ -338,7 +491,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
# Use our custom instruction tips with highest priority
optimizer.proposer_kwargs["tip"] = self.proposer_kwargs["tip"]
logging.info(
f"Using custom instruction tips: {self.proposer_kwargs['tip']}..."
f"Using custom instruction tips: {self.proposer_kwargs['tip'][:50] if self.proposer_kwargs['tip'] else 'None'}"
)
# Otherwise, if we have model-specific tips, use those
elif model_tips:
@@ -355,7 +508,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
)

logging.info(
f"Compiling program with {len(self.trainset)} training examples and {len(self.valset)} validation examples"
f"Compiling program with {len(self.trainset)} training examples, {len(self.valset)} validation examples, and {len(self.testset)} test examples"
)

# Create a custom compile method that injects our tip directly
2 changes: 2 additions & 0 deletions src/llama_prompt_ops/core/utils/__init__.py
@@ -10,10 +10,12 @@
from .format_utils import convert_json_to_yaml, json_to_yaml_file
from .logging import get_logger
from .strategy_utils import map_auto_mode_to_dspy
from .telemetry import PreOptimizationSummary

__all__ = [
"map_auto_mode_to_dspy",
"convert_json_to_yaml",
"json_to_yaml_file",
"get_logger",
"PreOptimizationSummary",
]