14 changes: 8 additions & 6 deletions src/cloudwatch-applicationsignals-mcp-server/evals/README.md
@@ -13,21 +13,23 @@ Currently used for evaluating CloudWatch Application Signals MCP tools. Designed
 
 ### Running Evals
 
+Run the commands below from the `src/cloudwatch-applicationsignals-mcp-server` directory.
+
 ```bash
 # List all available tasks
-python -m evals applicationsignals --list
+python -m evals tasks --list
 
 # Run specific task by ID
-python -m evals applicationsignals --task-id <task_id>
+python -m evals tasks --task-id <task_id>
 
 # Run all tasks from a task file
-python -m evals applicationsignals --task <task_file>
+python -m evals tasks --task <task_file>
 
 # Run with verbose logging
-python -m evals applicationsignals --task-id <task_id> -v
+python -m evals tasks --task-id <task_id> -v
 
 # Skip cleanup (useful for inspecting changes)
-python -m evals applicationsignals --task-id <task_id> --no-cleanup
+python -m evals tasks --task-id <task_id> --no-cleanup
 ```
 
 ### Configuration
@@ -49,7 +51,7 @@ Example:
 export MCP_EVAL_MODEL_ID=us.anthropic.claude-sonnet-4-20250514-v1:0
 export MCP_EVAL_MAX_TURNS=30
 export MCP_CLOUDWATCH_APPLICATION_SIGNALS_LOG_LEVEL=DEBUG # For debugging server issues
-python -m evals applicationsignals --task-id my_task
+python -m evals tasks --task-id my_task
 ```
 
 ### Creating Task Files
20 changes: 15 additions & 5 deletions src/cloudwatch-applicationsignals-mcp-server/evals/__main__.py
@@ -68,10 +68,14 @@ def _discover_tasks(task_dir: Path) -> tuple[List[Task], Dict[str, List[Task]]]:
     if task_dir_str not in sys.path:
         sys.path.insert(0, task_dir_str)
 
-    task_modules = [f.stem for f in task_dir.glob('*_tasks.py')]
-    logger.debug(f'Discovered task modules in {task_dir}: {task_modules}')
+    task_files = list(task_dir.rglob('*_tasks.py'))
+    logger.debug(f'Discovered task files in {task_dir}: {task_files}')
 
-    for module_name in task_modules:
+    for task_file in task_files:
+        # Convert file path to module name relative to task_dir
+        rel_path = task_file.relative_to(task_dir)
+        module_name = str(rel_path.with_suffix('')).replace('/', '.')
+
         try:
             module = importlib.import_module(module_name)
 
@@ -99,17 +103,23 @@ def _discover_tasks(task_dir: Path) -> tuple[List[Task], Dict[str, List[Task]]]:
     return all_tasks, tasks_by_module
 
 
-def _report_task_results(task: Task, result: TaskResult) -> None:
+def _report_task_results(task: Task, result: TaskResult, verbose: bool = False) -> None:
     """Report results for a single task.
 
     Args:
         task: Task instance
         result: TaskResult from EvalRunner
+        verbose: If True, include captured data in output
     """
     # TODO: Export detailed results to file and print only brief summary (pass/fail).
     # Need more usage/feedback to determine what belongs in summary vs detailed report.
     print(result)
 
+    if verbose:
+        print('\n')
+        print(result.get_captured_data_str())
+        print('\n')
+
 
 async def main():
     """Entry point for eval script."""
@@ -213,7 +223,7 @@ async def main():
 
     # Report results
     for task, result in zip(tasks, results):
-        _report_task_results(task, result)
+        _report_task_results(task, result, verbose=args.verbose)
 
     # TODO: Investigate more reliable subprocess cleanup mechanism
     # Give subprocess time to clean up before event loop closes (Python < 3.11)
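For reference, a minimal sketch of how the new `rglob`-based discovery turns a nested task file into an importable module name; the directory layout below is hypothetical and the separator handling assumes POSIX-style paths:

```python
from pathlib import Path

# Hypothetical layout for illustration; real task files just need to match '*_tasks.py'.
task_dir = Path('evals/tasks')
task_file = task_dir / 'slo' / 'breach_tasks.py'  # as found by task_dir.rglob('*_tasks.py')

rel_path = task_file.relative_to(task_dir)                     # PosixPath('slo/breach_tasks.py')
module_name = str(rel_path.with_suffix('')).replace('/', '.')  # 'slo.breach_tasks'
print(module_name)  # importable once task_dir is on sys.path, as __main__.py arranges
```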
@@ -44,6 +44,7 @@
     Validator,
     LLMJudgeValidator,
     BuildValidator,
+    ToolCallValidator,
 )
 from .validation_prompts import ValidationPromptType
 from .llm_provider import LLMProvider, BedrockLLMProvider
@@ -83,6 +84,7 @@
     # Built-in validators
     'LLMJudgeValidator',
     'BuildValidator',
+    'ToolCallValidator',
     'ValidationPromptType',
     # Captured data constants
     'GIT_DIFF',
@@ -86,12 +86,14 @@ def capture(
                 result = self.process_executor.run(
                     ['git', 'diff', '--'] + full_paths,
                     timeout=10,
+                    cwd=str(project_root),
                 )
             else:
                 # Capture all changes if no specific paths provided
                 result = self.process_executor.run(
                     ['git', 'diff'],
                     timeout=10,
+                    cwd=str(project_root),
                 )
             return {GIT_DIFF: result.stdout}
         except Exception as e:
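The new `cwd` argument pins the `git diff` invocation to the project under evaluation rather than whatever directory the harness was launched from. A standalone sketch of the same idea using `subprocess` directly (the `ProcessExecutor` wrapper above is assumed to behave similarly):

```python
import subprocess

# Illustrative only: '.' stands in for str(project_root) used by the capture code above.
result = subprocess.run(
    ['git', 'diff'],
    capture_output=True,
    text=True,
    timeout=10,
    cwd='.',  # without an explicit cwd, the diff would come from the caller's working directory
)
print(result.stdout)
```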
@@ -232,7 +232,7 @@ async def run_conversation(
                 tool_input = tool_use['input']
                 tool_use_id = tool_use['toolUseId']
 
-                logger.debug(f'Tool requested: {tool_name}')
+                logger.debug(f'Tool requested: {tool_name} with {tool_input}')
 
                 tool_input['toolUseId'] = tool_use_id
                 result = await execute_tool(
@@ -24,6 +24,7 @@
 FILE_TOOL_LIST_FILES = 'list_files'
 FILE_TOOL_READ_FILE = 'read_file'
 FILE_TOOL_WRITE_FILE = 'write_file'
+PERMITTED_FILE_TOOLS = {FILE_TOOL_LIST_FILES, FILE_TOOL_READ_FILE, FILE_TOOL_WRITE_FILE}
 
 
 def get_file_tools() -> List[Dict[str, Any]]:
@@ -72,9 +72,15 @@ def __init__(
         if bedrock_client is None:
             import boto3
             from .eval_config import AWS_REGION
+            from botocore.config import Config
 
             region = region_name or AWS_REGION
-            self.bedrock_client = boto3.client(service_name='bedrock-runtime', region_name=region)
+            config = Config(
+                max_pool_connections=5, retries={'max_attempts': 5, 'mode': 'adaptive'}
+            )
+            self.bedrock_client = boto3.client(
+                service_name='bedrock-runtime', region_name=region, config=config
+            )
         else:
             self.bedrock_client = bedrock_client
         self.model_id = model_id
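The `botocore` `Config` above caps the connection pool and switches retries to adaptive mode, which adds client-side rate limiting on top of the usual backoff, useful when several eval tasks hit Bedrock concurrently. A minimal standalone version of the same setup (the region is illustrative):

```python
import boto3
from botocore.config import Config

# Same settings as the diff above; creating the client does not call AWS yet.
config = Config(
    max_pool_connections=5,
    retries={'max_attempts': 5, 'mode': 'adaptive'},
)
client = boto3.client('bedrock-runtime', region_name='us-east-1', config=config)
```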
@@ -94,6 +94,32 @@ def from_error(cls, task_id: str, error: str) -> 'TaskResult':
             error=error,
         )
 
+    def get_captured_data_str(self) -> str:
+        """Get string representation of captured_data for debug reporting.
+
+        Returns:
+            Formatted string representation of all captured data
+        """
+        import json
+
+        if not self.captured_data:
+            return 'No captured data'
+
+        lines = ['Captured Data:', '=' * 40]
+
+        for key, value in self.captured_data.items():
+            lines.append(f'\n{key}:')
+            lines.append('-' * 40)
+            try:
+                if isinstance(value, (dict, list)):
+                    lines.append(json.dumps(value, indent=2, default=str))
+                else:
+                    lines.append(str(value))
+            except Exception as e:
+                lines.append(f'<Error formatting data: {e}>')
+
+        return '\n'.join(lines)
+
     def __str__(self) -> str:
         """Format result as a human-readable string."""
         lines = [
@@ -156,6 +182,8 @@ def __str__(self) -> str:
             for criterion_result in criteria_results:
                 status_text = criterion_result['status']
                 lines.append(f' [{status_text}] {criterion_result["criterion"]}')
+                if criterion_result.get('reasoning'):
+                    lines.append(f' Reasoning: {criterion_result["reasoning"]}')
 
         status = '✅ PASS' if self.success else '❌ FAIL'
         lines.extend(
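To show roughly what the verbose report looks like, here is the same formatting logic as `get_captured_data_str` applied to a made-up `captured_data` dict (the keys and values are illustrative, not the harness's real capture keys):

```python
import json

captured_data = {
    'tool_calls': [{'name': 'list_monitored_services'}, {'name': 'get_slo'}],
    'git_diff': 'diff --git a/app.py b/app.py\n...',
}

lines = ['Captured Data:', '=' * 40]
for key, value in captured_data.items():
    lines.append(f'\n{key}:')
    lines.append('-' * 40)
    if isinstance(value, (dict, list)):
        lines.append(json.dumps(value, indent=2, default=str))
    else:
        lines.append(str(value))

print('\n'.join(lines))
```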
@@ -25,6 +25,7 @@
     ROLE_USER,
     TOOL_CALLS,
 )
+from .file_tools import PERMITTED_FILE_TOOLS
 from .llm_provider import LLMProvider
 from .validation_prompts import ValidationPromptType
 from abc import ABC, abstractmethod
@@ -235,6 +236,86 @@ def _parse_llm_response(self, response_text: str, rubric: List[str]) -> List[Cri
         return criteria_results
 
 
+class ToolCallValidator(Validator):
+    """Validator that checks tool call ordering."""
+
+    def __init__(self, expected_tool_calls: List[List[str]], ignore_file_tools: bool = False):
+        """Initialize tool call validator.
+
+        Args:
+            expected_tool_calls: List of possible tool call sequences (list of lists).
+                Only one sequence needs to match exactly.
+            ignore_file_tools: If True, filter out file-related tools before validation
+        """
+        self.expected_tool_calls = expected_tool_calls
+        self.ignore_file_tools = ignore_file_tools
+
+    def get_name(self) -> str:
+        """Return validator name."""
+        return 'Tool Call'
+
+    async def validate(
+        self,
+        captured_data: Dict[str, Any],
+    ) -> ValidationResult:
+        """Validate tool calls match one of the expected sequences."""
+        logger.info('Validating tool calls...')
+
+        tool_calls = captured_data.get(TOOL_CALLS, [])
+        called_tools = [call['name'] for call in tool_calls]
+
+        # Filter out file tools if requested
+        if self.ignore_file_tools:
+            called_tools = [tool for tool in called_tools if tool not in PERMITTED_FILE_TOOLS]
+
+        # Check if any expected sequence matches
+        matched_sequence = None
+        for expected_sequence in self.expected_tool_calls:
+            if called_tools == expected_sequence:
+                matched_sequence = expected_sequence
+                break
+
+        if matched_sequence is not None:
+            return {
+                'validator_name': self.get_name(),
+                'overall_pass': True,
+                'criteria_results': [
+                    {
+                        'criterion': 'Tools called in one of expected orders',
+                        'status': 'PASS',
+                        'reasoning': f'Matched sequence: {" → ".join(matched_sequence)}',
+                    }
+                ],
+                'raw_validation_output': {
+                    'expected_tool_calls': self.expected_tool_calls,
+                    'called_tools': called_tools,
+                    'matched_sequence': matched_sequence,
+                    'ignore_file_tools': self.ignore_file_tools,
+                },
+            }
+        else:
+            expected_sequences_str = ' OR '.join(
+                [f'[{" → ".join(seq)}]' for seq in self.expected_tool_calls]
+            )
+            return {
+                'validator_name': self.get_name(),
+                'overall_pass': False,
+                'criteria_results': [
+                    {
+                        'criterion': 'Tools called in one of expected orders',
+                        'status': 'FAIL',
+                        'reasoning': f'Expected one of: {expected_sequences_str}, got: [{" → ".join(called_tools)}]',
+                    }
+                ],
+                'raw_validation_output': {
+                    'expected_tool_calls': self.expected_tool_calls,
+                    'called_tools': called_tools,
+                    'matched_sequence': None,
+                    'ignore_file_tools': self.ignore_file_tools,
+                },
+            }
+
+
 class BuildValidator(Validator):
     """Validator that runs build commands and checks exit code."""
 
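A hedged usage sketch for the new validator; the import path and tool names are assumptions for illustration, not taken from the repository:

```python
import asyncio

# Assumed import path -- adjust to wherever the evals package exposes these symbols.
from evals import TOOL_CALLS, ToolCallValidator

validator = ToolCallValidator(
    expected_tool_calls=[
        ['list_monitored_services', 'get_slo'],  # hypothetical tool names
        ['audit_services', 'get_slo'],
    ],
    ignore_file_tools=True,  # drop list_files/read_file/write_file before comparing
)

captured = {
    TOOL_CALLS: [
        {'name': 'read_file'},  # filtered out because ignore_file_tools=True
        {'name': 'list_monitored_services'},
        {'name': 'get_slo'},
    ]
}

result = asyncio.run(validator.validate(captured))
print(result['overall_pass'])  # True: matches the first expected sequence
```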