Merged
Commits
18 commits
8deea72
feat: Add pre-optimization summary display (Issue #19)
siddhantparadox Jun 27, 2025
fad488c
feat: validate minimum records in dataset file
Sangamesh26 Jun 28, 2025
85c4c97
chore: lint code
Sangamesh26 Jun 28, 2025
cb6bff5
feat: Add pre-optimization summary and fix CI failures
siddhantparadox Jul 2, 2025
1620a37
Merge pull request #28 from Sangamesh26/feature/validation-min-record…
heyjustinai Jul 2, 2025
0a7abf6
reimplemented baseline scoring and use test set
heyjustinai Jul 3, 2025
c9032e6
feat: Add pre-optimization summary display (Issue #19)
siddhantparadox Jun 27, 2025
ac3b647
feat: Add pre-optimization summary and fix CI failures
siddhantparadox Jul 2, 2025
4a43302
reimplemented baseline scoring and use test set
heyjustinai Jul 3, 2025
61ab1a4
Merge branch 'feature/pre-optimization-summary' of https://github.com…
heyjustinai Jul 3, 2025
e318171
temp file fix in test datasets
heyjustinai Jul 3, 2025
7b04df4
enhance task and prompt model names for strategy configurations
heyjustinai Jul 3, 2025
dda5d15
refactor pre-optimization summary handling in prompt_strategy
heyjustinai Jul 3, 2025
c54b8c0
update baseline computation settings and enhance strategy parameter e…
heyjustinai Jul 3, 2025
7e1a799
remove pre opt summary duplication
heyjustinai Jul 3, 2025
f0a81de
refactor logging configuration in CLI to allow dynamic log level adju…
heyjustinai Jul 3, 2025
7bb6487
refactor pre-optimization summary test to validate summary creation a…
heyjustinai Jul 3, 2025
f86e849
fix: resolve CI test failures by fixing atexit handler and removing l…
heyjustinai Jul 3, 2025
2 changes: 2 additions & 0 deletions .gitignore
@@ -111,3 +111,5 @@ use-cases/joule/
use-cases/dox/
datasets/dropbox_qa/dropbox_data.json
publish.sh

scripts/
44 changes: 44 additions & 0 deletions docs/advanced/logging.md
@@ -43,3 +43,47 @@ The exported JSON file will contain two main sections:

- `timings`: The duration of each major phase of the optimization process.
- `metrics`: A list of metrics logged during the process, including the metric key, value, and step.

## Pre-Optimization Summary

Before starting the optimization process, `llama-prompt-ops` displays a comprehensive summary of the optimization configuration. This summary provides transparency into what will happen during optimization and helps with debugging and reproducibility.

### Summary Contents

The pre-optimization summary includes:

- **Task Model**: The model used for executing the task
- **Proposer Model**: The model used for generating instruction proposals (MIPROv2)
- **Metric**: The evaluation metric being used
- **Train / Val size**: Number of training and validation examples
- **MIPRO Params**: Key MIPROv2 optimization parameters
- **Guidance**: Any custom instruction tips provided to the proposer
- **Baseline score**: Performance of the original prompt before optimization

### Example Output

```
=== Pre-Optimization Summary ===
Task Model : openai/gpt-4o-mini
Proposer Model : openai/gpt-4o
Metric : facility_metric
Train / Val size : 100 / 25
MIPRO Params : {"auto_user":"basic","auto_dspy":"light","max_labeled_demos":5,"max_bootstrapped_demos":4,"num_candidates":10,"num_threads":18,"init_temperature":0.5,"seed":9}
Guidance : Use chain-of-thought reasoning and show your work step by step...
Baseline score : 0.4200
```

### Controlling Visibility

The pre-optimization summary is logged at `INFO` level. To see it, ensure your log level is set to `INFO` or lower:

```bash
# Via command line
llama-prompt-ops migrate --config config.yaml --log-level INFO

# Via environment variable
export PROMPT_OPS_LOG_LEVEL=INFO
llama-prompt-ops migrate --config config.yaml
```

The summary provides valuable context for understanding optimization results and can help identify configuration issues before the optimization process begins.
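
For completeness, a minimal sketch (not part of this diff) of setting the documented `PROMPT_OPS_LOG_LEVEL` variable from Python before invoking the CLI; the subprocess call itself is illustrative:

```python
# Sketch: enable the INFO-level pre-optimization summary, then run the CLI.
# PROMPT_OPS_LOG_LEVEL is documented above; everything else is illustrative.
import os
import subprocess

os.environ["PROMPT_OPS_LOG_LEVEL"] = "INFO"
subprocess.run(
    ["llama-prompt-ops", "migrate", "--config", "config.yaml"],
    check=True,
)
```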
7 changes: 7 additions & 0 deletions src/llama_prompt_ops/core/migrator.py
@@ -160,11 +160,15 @@ def optimize(
if hasattr(self.strategy, "valset") and valset:
self.strategy.valset = valset

if hasattr(self.strategy, "testset") and testset:
self.strategy.testset = testset

self.logger.progress(
f"Applying {self.strategy.__class__.__name__} to optimize prompt"
)
self.logger.progress(f"Training set size: {len(trainset) if trainset else 0}")
self.logger.progress(f"Validation set size: {len(valset) if valset else 0}")
self.logger.progress(f"Test set size: {len(testset) if testset else 0}")

with self.logger.phase("Running optimization strategy"):
optimized_program = self.strategy.run(prompt_data)
@@ -215,6 +219,9 @@ def load_dataset_with_adapter(
if hasattr(self.strategy, "valset") and self.valset:
self.strategy.valset = self.valset

if hasattr(self.strategy, "testset") and self.testset:
self.strategy.testset = self.testset

return self.trainset, self.valset, self.testset

def evaluate(
7 changes: 7 additions & 0 deletions src/llama_prompt_ops/core/model_strategies.py
@@ -45,6 +45,8 @@ def __init__(
max_bootstrapped_demos: int = 4,
max_labeled_demos: int = 5,
auto: Optional[Literal["basic", "intermediate", "advanced"]] = "basic",
task_model_name: Optional[str] = None,
prompt_model_name: Optional[str] = None,
**kwargs,
):
"""
@@ -60,6 +62,8 @@ def __init__(
max_bootstrapped_demos: Maximum number of bootstrapped demos for MIPROv2
max_labeled_demos: Maximum number of labeled demos for MIPROv2
auto: Auto mode for MIPROv2 (basic, intermediate, advanced)
task_model_name: Name of the task model (for display purposes)
prompt_model_name: Name of the prompt/proposer model (for display purposes)
**kwargs: Additional parameters for BasicOptimizationStrategy
"""
# Verify that the model is a Llama model
@@ -77,6 +81,8 @@ def __init__(
max_bootstrapped_demos=max_bootstrapped_demos,
max_labeled_demos=max_labeled_demos,
auto=auto,
task_model_name=task_model_name,
prompt_model_name=prompt_model_name,
**kwargs,
)

@@ -145,6 +151,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:

# Ensure the base strategy has the latest models and datasets
# This is important because these might be set after initialization
# and the pre-optimization summary needs them
self.base_strategy.task_model = self.task_model
self.base_strategy.prompt_model = self.prompt_model
self.base_strategy.trainset = self.trainset
171 changes: 147 additions & 24 deletions src/llama_prompt_ops/core/prompt_strategies.py
@@ -14,13 +14,15 @@
import logging
import os
import sys
import time
import traceback
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Union

import dspy
from typing_extensions import Literal

from .evaluation import create_evaluator
from .utils import map_auto_mode_to_dspy


@@ -144,6 +146,11 @@ def __init__(
fewshot_aware_proposer: bool = True,
use_llama_tips: bool = True,
requires_permission_to_run: bool = False,
# Baseline computation settings
compute_baseline: bool = True,
# Model name parameters for display
task_model_name: Optional[str] = None,
prompt_model_name: Optional[str] = None,
**kwargs,
):
"""
@@ -179,7 +186,13 @@ def __init__(
tip_aware_proposer: Whether to use tip-aware instruction proposals
fewshot_aware_proposer: Whether to use few-shot aware instruction proposals
requires_permission_to_run: Whether to require user permission to run
provide_traceback: Whether to provide tracebacks for errors

# Baseline computation parameters
compute_baseline: Whether to compute baseline score before optimization

# Model name parameters for display
task_model_name: Name of the task model
prompt_model_name: Name of the prompt model

**kwargs: Additional configuration parameters
"""
@@ -192,6 +205,7 @@ def __init__(
# Training and validation data
self.trainset = kwargs.get("trainset", [])
self.valset = kwargs.get("valset", [])
self.testset = kwargs.get("testset", [])

# Model-specific optimization settings
self.use_llama_tips = use_llama_tips
@@ -221,6 +235,127 @@ def __init__(
self.fewshot_aware_proposer = fewshot_aware_proposer
self.requires_permission_to_run = requires_permission_to_run

# Baseline computation settings
self.compute_baseline = compute_baseline

# Model name parameters for display
self.task_model_name = task_model_name
self.prompt_model_name = prompt_model_name

def _get_model_name(self, model) -> str:
Review comment: we want to make sure to add type safety here, and also not violate the open/closed principle.

"""
Get a human-readable name for a model using stored names.

Args:
model: The model object to get the name for

Returns:
A string representation of the model name
"""
if model is None:
return "None"

# Use stored model names if available
if model is self.task_model and self.task_model_name:
return self.task_model_name
if model is self.prompt_model and self.prompt_model_name:
return self.prompt_model_name

# Fallback to legacy introspection for backward compatibility
if hasattr(model, "model_name"):
return str(model.model_name)
if hasattr(model, "model"):
return str(model.model)
if hasattr(model, "_model") and hasattr(model._model, "model"):
return str(model._model.model)

# Final fallback
return str(model)
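
# --- Hedged sketch (reviewer suggestion above; not part of this PR) ---
# One way to make model-name resolution type-safe and open for extension:
# route it through a list of typed resolver callables that can be extended
# without editing the strategy class. All names below are illustrative.
from typing import Any, Callable, List, Optional

ModelNameResolver = Callable[[Any], Optional[str]]

_NAME_RESOLVERS: List[ModelNameResolver] = [
    lambda m: str(m.model_name) if hasattr(m, "model_name") else None,
    lambda m: str(m.model) if hasattr(m, "model") else None,
]

def resolve_model_name(model: Any, explicit_name: Optional[str] = None) -> str:
    """Prefer an explicitly configured display name, then registered resolvers."""
    if explicit_name:
        return explicit_name
    if model is None:
        return "None"
    for resolver in _NAME_RESOLVERS:
        name = resolver(model)
        if name is not None:
            return name
    return str(model)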

def _create_signature(self, prompt_data: Dict[str, Any], instructions: str):
"""
Create a DSPy signature with explicit field definitions.

Args:
prompt_data: Dictionary containing inputs and outputs field definitions
instructions: The instruction text for the signature

Returns:
DSPy signature class
"""
# Create a signature class dynamically with proper field definitions
input_fields = {}
output_fields = {}

# Define input and output fields based on prompt_data
for field in prompt_data.get("inputs", ["question"]):
input_fields[field] = dspy.InputField(desc="${" + field + "}")
for field in prompt_data.get("outputs", ["answer"]):
output_fields[field] = dspy.OutputField(desc="${" + field + "}")

# Create the signature class with proper field definitions
DynamicSignature = type(
"DynamicSignature",
(dspy.Signature,),
{
**input_fields,
**output_fields,
"__doc__": instructions, # Store the instructions as the docstring
},
)

return DynamicSignature

def _compute_baseline_score(self, prompt_data: Dict[str, Any]) -> Optional[float]:
Review comment: We should use the built-in evaluator class here.

Review comment: We should probably have this separately as well: (i) single responsibility principle, (ii) hard to reuse it across different strategies, (iii) allow testing in isolation.

"""
Compute baseline score using the original prompt before optimization.
Uses testset to avoid data leakage and evaluation.py for consistency.

Args:
prompt_data: Dictionary containing the prompt text and metadata

Returns:
Baseline score as float, or None if computation fails or is not possible
"""
if not self.metric or not self.testset:
logging.debug("Skipping baseline computation: missing metric or test set")
return None

if not self.compute_baseline:
logging.debug("Baseline computation disabled")
return None

try:
start_time = time.time()

# Use consistent signature creation with original prompt
baseline_signature = self._create_signature(
prompt_data, prompt_data["text"]
)
baseline_program = dspy.Predict(baseline_signature)

print(
f"\nComputing baseline score on {len(self.testset)} test examples using {self.num_threads} threads..."
)

evaluator = create_evaluator(
metric=self.metric,
devset=self.testset,
num_threads=self.num_threads, # Use the strategy's num_threads setting
display_progress=True,
display_table=False,
)

score = evaluator.evaluate(baseline_program)
duration = time.time() - start_time

print(f"✅ Baseline Score: {score:.3f} in {duration:.2f}s\n")
return float(score)

except Exception as e:
logging.warning(f"Baseline evaluation failed: {e}")
return None
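
# --- Hedged sketch (reviewer suggestions above; not part of this PR) ---
# Extracting baseline scoring into its own small class would give it a single
# responsibility, let other strategies reuse it, and make it testable in
# isolation, while still delegating to the create_evaluator helper imported
# above. The class name and defaults are illustrative.
class BaselineScorer:
    def __init__(self, metric, testset, num_threads: int = 18):
        self.metric = metric
        self.testset = testset
        self.num_threads = num_threads

    def score(self, program) -> Optional[float]:
        """Evaluate an un-optimized program on the held-out test set."""
        if not self.metric or not self.testset:
            return None
        try:
            evaluator = create_evaluator(
                metric=self.metric,
                devset=self.testset,
                num_threads=self.num_threads,
                display_progress=True,
                display_table=False,
            )
            return float(evaluator.evaluate(program))
        except Exception as exc:  # defensive fallback, mirrors the method above
            logging.warning(f"Baseline evaluation failed: {exc}")
            return None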

def run(self, prompt_data: Dict[str, Any]) -> Any:
"""
Apply basic optimization to the prompt using DSPy's MIPROv2.
@@ -237,6 +372,11 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
if "dspy" not in globals() or not self.trainset:
return f"[Optimized for {self.model_name}] {text}"

# Display pre-optimization summary using utility function
from .utils.summary_utils import create_and_display_summary

create_and_display_summary(self, prompt_data)

try:
# Add model-specific tips to the prompt if enabled
model_tips = None
@@ -269,29 +409,12 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:

# Update the prompt text in prompt_data
prompt_data["text"] = text
# Create a signature class dynamically with proper field definitions
input_fields = {}
output_fields = {}

# Define input and output fields based on prompt_data
for field in prompt_data.get("inputs", ["question"]):
input_fields[field] = dspy.InputField(desc="${" + field + "}")
for field in prompt_data.get("outputs", ["answer"]):
output_fields[field] = dspy.OutputField(desc="${" + field + "}")

# Create the signature class with proper field definitions
DynamicSignature = type(
"DynamicSignature",
(dspy.Signature,),
{
**input_fields,
**output_fields,
"__doc__": text, # Store the instructions as the docstring
},
)

# Create signature using consistent helper method with enhanced prompt
signature = self._create_signature(prompt_data, text)

# Create program instance with the signature
program = dspy.Predict(DynamicSignature)
program = dspy.Predict(signature)

# Map our naming convention to DSPy's expected values
dspy_auto_mode = map_auto_mode_to_dspy(self.auto)
@@ -338,7 +461,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
# Use our custom instruction tips with highest priority
optimizer.proposer_kwargs["tip"] = self.proposer_kwargs["tip"]
logging.info(
f"Using custom instruction tips: {self.proposer_kwargs['tip']}..."
f"Using custom instruction tips: {self.proposer_kwargs['tip'][:50] if self.proposer_kwargs['tip'] else 'None'}"
)
# Otherwise, if we have model-specific tips, use those
elif model_tips:
@@ -355,7 +478,7 @@ def run(self, prompt_data: Dict[str, Any]) -> Any:
)

logging.info(
f"Compiling program with {len(self.trainset)} training examples and {len(self.valset)} validation examples"
f"Compiling program with {len(self.trainset)} training examples, {len(self.valset)} validation examples, and {len(self.testset)} test examples"
)

# Create a custom compile method that injects our tip directly
Expand Down
5 changes: 5 additions & 0 deletions src/llama_prompt_ops/core/utils/__init__.py
@@ -10,10 +10,15 @@
from .format_utils import convert_json_to_yaml, json_to_yaml_file
from .logging import get_logger
from .strategy_utils import map_auto_mode_to_dspy
from .summary_utils import create_and_display_summary, create_pre_optimization_summary
from .telemetry import PreOptimizationSummary

__all__ = [
"map_auto_mode_to_dspy",
"convert_json_to_yaml",
"json_to_yaml_file",
"get_logger",
"PreOptimizationSummary",
"create_pre_optimization_summary",
"create_and_display_summary",
]