14 changes: 8 additions & 6 deletions src/cloudwatch-applicationsignals-mcp-server/evals/README.md
@@ -13,21 +13,23 @@ Currently used for evaluating CloudWatch Application Signals MCP tools. Designed
 
 ### Running Evals
 
+Run the commands below from the `src/cloudwatch-applicationsignals-mcp-server` directory.
+
 ```bash
 # List all available tasks
-python -m evals applicationsignals --list
+python -m evals tasks --list
 
 # Run specific task by ID
-python -m evals applicationsignals --task-id <task_id>
+python -m evals tasks --task-id <task_id>
 
 # Run all tasks from a task file
-python -m evals applicationsignals --task <task_file>
+python -m evals tasks --task <task_file>
 
 # Run with verbose logging
-python -m evals applicationsignals --task-id <task_id> -v
+python -m evals tasks --task-id <task_id> -v
 
 # Skip cleanup (useful for inspecting changes)
-python -m evals applicationsignals --task-id <task_id> --no-cleanup
+python -m evals tasks --task-id <task_id> --no-cleanup
 ```
 
 ### Configuration
@@ -49,7 +51,7 @@ Example:
 export MCP_EVAL_MODEL_ID=us.anthropic.claude-sonnet-4-20250514-v1:0
 export MCP_EVAL_MAX_TURNS=30
 export MCP_CLOUDWATCH_APPLICATION_SIGNALS_LOG_LEVEL=DEBUG # For debugging server issues
-python -m evals applicationsignals --task-id my_task
+python -m evals tasks --task-id my_task
 ```
 
 ### Creating Task Files
20 changes: 15 additions & 5 deletions src/cloudwatch-applicationsignals-mcp-server/evals/__main__.py
@@ -68,10 +68,14 @@ def _discover_tasks(task_dir: Path) -> tuple[List[Task], Dict[str, List[Task]]]:
     if task_dir_str not in sys.path:
         sys.path.insert(0, task_dir_str)
 
-    task_modules = [f.stem for f in task_dir.glob('*_tasks.py')]
-    logger.debug(f'Discovered task modules in {task_dir}: {task_modules}')
+    task_files = list(task_dir.rglob('*_tasks.py'))
+    logger.debug(f'Discovered task files in {task_dir}: {task_files}')
 
-    for module_name in task_modules:
+    for task_file in task_files:
+        # Convert file path to module name relative to task_dir
+        rel_path = task_file.relative_to(task_dir)
+        module_name = str(rel_path.with_suffix('')).replace('/', '.')
+
         try:
             module = importlib.import_module(module_name)
 
@@ -99,17 +103,23 @@ def _discover_tasks(task_dir: Path) -> tuple[List[Task], Dict[str, List[Task]]]:
     return all_tasks, tasks_by_module
 
 
-def _report_task_results(task: Task, result: TaskResult) -> None:
+def _report_task_results(task: Task, result: TaskResult, verbose: bool = False) -> None:
     """Report results for a single task.
 
     Args:
         task: Task instance
         result: TaskResult from EvalRunner
+        verbose: If True, include captured data in output
     """
     # TODO: Export detailed results to file and print only brief summary (pass/fail).
     # Need more usage/feedback to determine what belongs in summary vs detailed report.
     print(result)
 
+    if verbose:
+        print('\n')
+        print(result.get_captured_data_str())
+        print('\n')
+
 
 async def main():
     """Entry point for eval script."""
@@ -213,7 +223,7 @@ async def main():
 
     # Report results
     for task, result in zip(tasks, results):
-        _report_task_results(task, result)
+        _report_task_results(task, result, verbose=args.verbose)
 
     # TODO: Investigate more reliable subprocess cleanup mechanism
     # Give subprocess time to clean up before event loop closes (Python < 3.11)
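For reference, a minimal sketch of how the new `rglob`-based discovery turns a nested task file into an importable module name; the directory layout below is hypothetical and the separator handling assumes POSIX-style paths:

```python
from pathlib import Path

# Hypothetical layout for illustration; real task files just need to match '*_tasks.py'.
task_dir = Path('evals/tasks')
task_file = task_dir / 'slo' / 'breach_tasks.py'  # as found by task_dir.rglob('*_tasks.py')

rel_path = task_file.relative_to(task_dir)                     # PosixPath('slo/breach_tasks.py')
module_name = str(rel_path.with_suffix('')).replace('/', '.')  # 'slo.breach_tasks'
print(module_name)  # importable once task_dir is on sys.path, as __main__.py arranges
```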
@@ -44,6 +44,7 @@
     Validator,
     LLMJudgeValidator,
     BuildValidator,
+    ToolCallValidator,
 )
 from .validation_prompts import ValidationPromptType
 from .llm_provider import LLMProvider, BedrockLLMProvider
@@ -83,6 +84,7 @@
     # Built-in validators
     'LLMJudgeValidator',
     'BuildValidator',
+    'ToolCallValidator',
     'ValidationPromptType',
     # Captured data constants
     'GIT_DIFF',
@@ -86,12 +86,14 @@ def capture(
                 result = self.process_executor.run(
                     ['git', 'diff', '--'] + full_paths,
                     timeout=10,
+                    cwd=str(project_root),
                 )
             else:
                 # Capture all changes if no specific paths provided
                 result = self.process_executor.run(
                     ['git', 'diff'],
                     timeout=10,
+                    cwd=str(project_root),
                 )
             return {GIT_DIFF: result.stdout}
         except Exception as e:
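The new `cwd` argument pins the `git diff` invocation to the project under evaluation rather than whatever directory the harness was launched from. A standalone sketch of the same idea using `subprocess` directly (the `ProcessExecutor` wrapper above is assumed to behave similarly):

```python
import subprocess

# Illustrative only: '.' stands in for str(project_root) used by the capture code above.
result = subprocess.run(
    ['git', 'diff'],
    capture_output=True,
    text=True,
    timeout=10,
    cwd='.',  # without an explicit cwd, the diff would come from the caller's working directory
)
print(result.stdout)
```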
@@ -232,7 +232,7 @@ async def run_conversation(
                 tool_input = tool_use['input']
                 tool_use_id = tool_use['toolUseId']
 
-                logger.debug(f'Tool requested: {tool_name}')
+                logger.debug(f'Tool requested: {tool_name} with {tool_input}')
 
                 tool_input['toolUseId'] = tool_use_id
                 result = await execute_tool(
@@ -24,6 +24,7 @@
 FILE_TOOL_LIST_FILES = 'list_files'
 FILE_TOOL_READ_FILE = 'read_file'
 FILE_TOOL_WRITE_FILE = 'write_file'
+PERMITTED_FILE_TOOLS = {FILE_TOOL_LIST_FILES, FILE_TOOL_READ_FILE, FILE_TOOL_WRITE_FILE}
 
 
 def get_file_tools() -> List[Dict[str, Any]]:
@@ -72,9 +72,15 @@ def __init__(
         if bedrock_client is None:
             import boto3
             from .eval_config import AWS_REGION
+            from botocore.config import Config
 
             region = region_name or AWS_REGION
-            self.bedrock_client = boto3.client(service_name='bedrock-runtime', region_name=region)
+            config = Config(
+                max_pool_connections=5, retries={'max_attempts': 5, 'mode': 'adaptive'}
+            )
+            self.bedrock_client = boto3.client(
+                service_name='bedrock-runtime', region_name=region, config=config
+            )
         else:
             self.bedrock_client = bedrock_client
         self.model_id = model_id
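The `botocore` `Config` above caps the connection pool and switches retries to adaptive mode, which adds client-side rate limiting on top of the usual backoff, useful when several eval tasks hit Bedrock concurrently. A minimal standalone version of the same setup (the region is illustrative):

```python
import boto3
from botocore.config import Config

# Same settings as the diff above; creating the client does not call AWS yet.
config = Config(
    max_pool_connections=5,
    retries={'max_attempts': 5, 'mode': 'adaptive'},
)
client = boto3.client('bedrock-runtime', region_name='us-east-1', config=config)
```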
@@ -94,6 +94,32 @@ def from_error(cls, task_id: str, error: str) -> 'TaskResult':
             error=error,
         )
 
+    def get_captured_data_str(self) -> str:
+        """Get string representation of captured_data for debug reporting.
+
+        Returns:
+            Formatted string representation of all captured data
+        """
+        import json
+
+        if not self.captured_data:
+            return 'No captured data'
+
+        lines = ['Captured Data:', '=' * 40]
+
+        for key, value in self.captured_data.items():
+            lines.append(f'\n{key}:')
+            lines.append('-' * 40)
+            try:
+                if isinstance(value, (dict, list)):
+                    lines.append(json.dumps(value, indent=2, default=str))
+                else:
+                    lines.append(str(value))
+            except Exception as e:
+                lines.append(f'<Error formatting data: {e}>')
+
+        return '\n'.join(lines)
+
     def __str__(self) -> str:
         """Format result as a human-readable string."""
         lines = [
@@ -156,6 +182,8 @@ def __str__(self) -> str:
             for criterion_result in criteria_results:
                 status_text = criterion_result['status']
                 lines.append(f' [{status_text}] {criterion_result["criterion"]}')
+                if criterion_result.get('reasoning'):
+                    lines.append(f' Reasoning: {criterion_result["reasoning"]}')
 
         status = '✅ PASS' if self.success else '❌ FAIL'
         lines.extend(
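To show roughly what the verbose report looks like, here is the same formatting logic as `get_captured_data_str` applied to a made-up `captured_data` dict (the keys and values are illustrative, not the harness's real capture keys):

```python
import json

captured_data = {
    'tool_calls': [{'name': 'list_monitored_services'}, {'name': 'get_slo'}],
    'git_diff': 'diff --git a/app.py b/app.py\n...',
}

lines = ['Captured Data:', '=' * 40]
for key, value in captured_data.items():
    lines.append(f'\n{key}:')
    lines.append('-' * 40)
    if isinstance(value, (dict, list)):
        lines.append(json.dumps(value, indent=2, default=str))
    else:
        lines.append(str(value))

print('\n'.join(lines))
```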
@@ -25,6 +25,7 @@
     ROLE_USER,
     TOOL_CALLS,
 )
+from .file_tools import PERMITTED_FILE_TOOLS
 from .llm_provider import LLMProvider
 from .validation_prompts import ValidationPromptType
 from abc import ABC, abstractmethod
@@ -235,6 +236,86 @@ def _parse_llm_response(self, response_text: str, rubric: List[str]) -> List[Cri
         return criteria_results
 
 
+class ToolCallValidator(Validator):
+    """Validator that checks tool call ordering."""
+
+    def __init__(self, expected_tool_calls: List[List[str]], ignore_file_tools: bool = False):
+        """Initialize tool call validator.
+
+        Args:
+            expected_tool_calls: List of possible tool call sequences (list of lists).
+                Only one sequence needs to match exactly.
+            ignore_file_tools: If True, filter out file-related tools before validation
+        """
+        self.expected_tool_calls = expected_tool_calls
+        self.ignore_file_tools = ignore_file_tools
+
+    def get_name(self) -> str:
+        """Return validator name."""
+        return 'Tool Call'
+
+    async def validate(
+        self,
+        captured_data: Dict[str, Any],
+    ) -> ValidationResult:
+        """Validate tool calls match one of the expected sequences."""
+        logger.info('Validating tool calls...')
+
+        tool_calls = captured_data.get(TOOL_CALLS, [])
+        called_tools = [call['name'] for call in tool_calls]
+
+        # Filter out file tools if requested
+        if self.ignore_file_tools:
+            called_tools = [tool for tool in called_tools if tool not in PERMITTED_FILE_TOOLS]
+
+        # Check if any expected sequence matches
+        matched_sequence = None
+        for expected_sequence in self.expected_tool_calls:
+            if called_tools == expected_sequence:
+                matched_sequence = expected_sequence
+                break
+
+        if matched_sequence is not None:
+            return {
+                'validator_name': self.get_name(),
+                'overall_pass': True,
+                'criteria_results': [
+                    {
+                        'criterion': 'Tools called in one of expected orders',
+                        'status': 'PASS',
+                        'reasoning': f'Matched sequence: {" → ".join(matched_sequence)}',
+                    }
+                ],
+                'raw_validation_output': {
+                    'expected_tool_calls': self.expected_tool_calls,
+                    'called_tools': called_tools,
+                    'matched_sequence': matched_sequence,
+                    'ignore_file_tools': self.ignore_file_tools,
+                },
+            }
+        else:
+            expected_sequences_str = ' OR '.join(
+                [f'[{" → ".join(seq)}]' for seq in self.expected_tool_calls]
+            )
+            return {
+                'validator_name': self.get_name(),
+                'overall_pass': False,
+                'criteria_results': [
+                    {
+                        'criterion': 'Tools called in one of expected orders',
+                        'status': 'FAIL',
+                        'reasoning': f'Expected one of: {expected_sequences_str}, got: [{" → ".join(called_tools)}]',
+                    }
+                ],
+                'raw_validation_output': {
+                    'expected_tool_calls': self.expected_tool_calls,
+                    'called_tools': called_tools,
+                    'matched_sequence': None,
+                    'ignore_file_tools': self.ignore_file_tools,
+                },
+            }
+
+
 class BuildValidator(Validator):
     """Validator that runs build commands and checks exit code."""
 
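A hedged usage sketch for the new validator; the import path and tool names are assumptions for illustration, not taken from the repository:

```python
import asyncio

# Assumed import path -- adjust to wherever the evals package exposes these symbols.
from evals import TOOL_CALLS, ToolCallValidator

validator = ToolCallValidator(
    expected_tool_calls=[
        ['list_monitored_services', 'get_slo'],  # hypothetical tool names
        ['audit_services', 'get_slo'],
    ],
    ignore_file_tools=True,  # drop list_files/read_file/write_file before comparing
)

captured = {
    TOOL_CALLS: [
        {'name': 'read_file'},  # filtered out because ignore_file_tools=True
        {'name': 'list_monitored_services'},
        {'name': 'get_slo'},
    ]
}

result = asyncio.run(validator.validate(captured))
print(result['overall_pass'])  # True: matches the first expected sequence
```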