def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]:
    """Collect the unique base images referenced by a dataset.

    The dataset is obtained as a local JSON-lines file through
    ``download_and_concat_dataset`` and read one line at a time. Every JSON
    object is mapped to an image name with ``get_official_docker_image``.

    Args:
        dataset_name: Dataset identifier handed to
            ``download_and_concat_dataset``.
        split: Kept for interface compatibility; it is not used in this
            function. NOTE(review): the download helper is only given
            ``LANGUAGE`` - confirm that ignoring the split is intended.

    Returns:
        The distinct image names, sorted so that the resulting build order is
        the same on every run (plain set iteration order is not).
    """
    local_path = download_and_concat_dataset(dataset_name, LANGUAGE)
    base_images: set[str] = set()

    with open(local_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                instance = json.loads(line)
            except json.JSONDecodeError as e:
                logger.warning(f"Skipping malformed JSON line: {e}")
                continue
            # A syntactically valid line can still be a list/number/string;
            # the image lookup indexes by key, so only objects are usable.
            if not isinstance(instance, dict):
                logger.warning(
                    f"Skipping non-object JSON line {line_no}: "
                    f"got {type(instance).__name__}"
                )
                continue
            base_images.add(get_official_docker_image(instance))

    return sorted(base_images)
def get_image_name() -> str:
    """Return the ``name:tag`` reference of the OpenAgentSafety server image.

    The name part is read from the ``EVAL_AGENT_SERVER_IMAGE`` environment
    variable and falls back to ``openagentsafety-agent-server``. An empty
    value is treated like an unset variable, because an empty name would
    yield the invalid reference ``:tag``.

    The tag is ``<IMAGE_TAG_PREFIX>-openagentsafety`` when the
    ``IMAGE_TAG_PREFIX`` environment variable is non-empty; otherwise it is
    whatever ``get_vendor_sdk_commit()`` returns.

    Returns:
        The image reference as a single ``name:tag`` string.
    """
    # `or` (not a getenv default) so that "" falls back as well as "unset".
    image_name = os.getenv("EVAL_AGENT_SERVER_IMAGE") or "openagentsafety-agent-server"
    tag_prefix = os.getenv("IMAGE_TAG_PREFIX")
    # Only consult the vendored SDK when no explicit prefix was supplied.
    tag = f"{tag_prefix}-openagentsafety" if tag_prefix else get_vendor_sdk_commit()
    return f"{image_name}:{tag}"
benchmarks.utils.conversation import build_event_persistence_callback @@ -42,12 +45,13 @@ def convert_numpy_types(obj: Any) -> Any: return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() - elif pd.isna(obj): - return None elif isinstance(obj, dict): return {k: convert_numpy_types(v) for k, v in obj.items()} elif isinstance(obj, list): return [convert_numpy_types(item) for item in obj] + # pd.isna() raises ValueError on dicts/lists — safe here since those are handled above + elif pd.isna(obj): + return None return obj @@ -61,6 +65,10 @@ def default(self, o): return float(o) elif isinstance(o, np.ndarray): return o.tolist() + elif hasattr(o, "model_dump"): + return o.model_dump() + # JSONEncoder.default() is only called for non-serializable types, + # so dicts/lists (which cause pd.isna to raise) won't reach here. elif pd.isna(o): return None return super().default(o) @@ -187,7 +195,7 @@ def cleanup_docker_containers(): "-a", "-q", "--filter", - "ancestor=openagentsafety-agent-server:local", + f"ancestor={get_image_name()}", ], capture_output=True, text=True, @@ -235,39 +243,25 @@ def write_npc_config( } config_json = json.dumps(config, indent=2, cls=NumpyEncoder) + # NOTE: The heredoc approach is simpler than the previous tempfile+upload but + # embeds config content in the bash command string, which could appear in + # container logs or process listings. This is acceptable here because the + # config contains NPC scenario data (not secrets) — API keys are resolved + # separately via environment variables and never written to this file. 
+ bash_command = f""" +mkdir -p /npc +cat > /npc/.npc_config.json << 'EOFNPC' +{config_json} +EOFNPC +chmod 600 /npc/.npc_config.json +""" - # Create /npc directory in container (doesn't leak sensitive info) - try: - workspace.execute_command("mkdir -p /npc", timeout=30) - except Exception as e: - logger.error(f"Failed to create /npc directory: {e}") - raise - - # Write config to temporary file on host - temp_fd, temp_path = tempfile.mkstemp(suffix=".json", text=True) try: - with os.fdopen(temp_fd, "w") as f: - f.write(config_json) - - # Upload file to container using file_upload (avoids bash command leak) - result = workspace.file_upload( - source_path=temp_path, destination_path="/npc/.npc_config.json" - ) - - if not result.success: - raise RuntimeError(f"File upload failed: {result}") - - # Set restrictive permissions - workspace.execute_command("chmod 600 /npc/.npc_config.json", timeout=30) - - logger.info("Wrote NPC config to /npc/.npc_config.json (via file_upload)") + workspace.execute_command(bash_command, timeout=60) + logger.info("Wrote NPC config to /npc/.npc_config.json") except Exception as e: logger.error(f"Failed to write NPC config: {e}") raise - finally: - # Clean up temporary file - if os.path.exists(temp_path): - os.unlink(temp_path) def generate_instruction(instance_data: dict, template_path: str | None = None) -> str: @@ -401,7 +395,18 @@ def prepare_workspace( resource_factor: Resource factor for runtime allocation (default: 1). forward_env: Environment variables to forward into the workspace. 
""" - server_image = build_workspace_image() + # Try to build image on-the-fly, fall back to pre-built if build fails + try: + server_image = build_workspace_image() + except (subprocess.CalledProcessError, RuntimeError) as e: + logger.warning(f"On-the-fly build failed: {e}") + server_image = get_image_name() + + if not check_image_exists(server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {server_image} does not exist" + ) + logger.info(f"Using pre-built image {server_image}") workspace = DockerWorkspace( server_image=server_image, @@ -562,6 +567,77 @@ def event_callback(event) -> None: ) +def generate_report(output_jsonl: str, report_path: str, model_name: str) -> None: + """Generate a .report.json from the output.jsonl, matching the format + used by other benchmarks (SWE-Bench, GAIA, etc.). + + Resolution logic mirrors eval_infer.py: an instance is "resolved" only + when ``final_score.result > 0`` and ``final_score.result == final_score.total``. + """ + completed_ids: list[str] = [] + resolved_ids: list[str] = [] + unresolved_ids: list[str] = [] + error_ids: list[str] = [] + + if not os.path.exists(output_jsonl): + logger.warning("No output.jsonl found at %s, skipping report", output_jsonl) + return + + with open(output_jsonl, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + continue + + instance_id = data.get("instance_id", "") + error = data.get("error") + test_result = data.get("test_result", {}) + + if error or test_result.get("error"): + error_ids.append(instance_id) + else: + completed_ids.append(instance_id) + final_score = test_result.get("final_score", {}) + result = final_score.get("result", 0) + total = final_score.get("total", 0) + if result > 0 and result == total: + resolved_ids.append(instance_id) + else: + unresolved_ids.append(instance_id) + + submitted_ids = completed_ids + error_ids + report = { + "model_name_or_path": 
model_name, + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": 0, + "resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "empty_patch_instances": 0, + "error_instances": len(error_ids), + "submitted_ids": submitted_ids, + "completed_ids": completed_ids, + "incomplete_ids": [], + "resolved_ids": resolved_ids, + "unresolved_ids": unresolved_ids, + } + + with open(report_path, "w") as f: + json.dump(report, f, indent=4) + + logger.info( + "Report written to %s (%d completed, %d errors)", + report_path, + len(completed_ids), + len(error_ids), + ) + + def main() -> None: """Main entry point.""" parser = get_parser(add_llm_config=True) @@ -573,6 +649,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") + # Load LLM config llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) @@ -600,13 +677,14 @@ def main() -> None: max_iterations=args.max_iterations, eval_output_dir=structured_output_dir, details={ - "server_image": "openagentsafety-agent-server:local", + "server_image": get_image_name(), "platform": "linux/amd64", }, eval_limit=args.n_limit, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, + max_retries=args.max_retries, enable_delegation=args.enable_delegation, ) @@ -663,6 +741,10 @@ def _cb(instance: EvalInstance, out: EvalOutput) -> None: # Run evaluation evaluator.run(on_result=_default_on_result_writer(metadata.eval_output_dir)) + # Generate .report.json for nemo_evaluator compatibility + report_path = os.path.join(metadata.eval_output_dir, "output.report.json") + generate_report(evaluator.output_path, report_path, llm.model) + # Final cleanup cleanup_docker_containers() diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index 
ab52b26ec..379d718da 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -117,11 +117,11 @@ def convert_to_swebench_format(input_file: str, output_file: str) -> None: def run_swebench_evaluation( predictions_file: str, run_id: str, - dataset: str, - workers: int, - split: str, - modal: bool, - timeout: int, + dataset: str = EVAL_DEFAULTS["dataset"], + workers: int = EVAL_DEFAULTS["workers"], + split: str = EVAL_DEFAULTS["split"], + modal: bool = EVAL_DEFAULTS["modal"], + timeout: int = EVAL_DEFAULTS["timeout"], ) -> None: """ Run SWE-Bench evaluation on the predictions file. diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index b3adec48b..737213680 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -257,6 +257,7 @@ def run_swebench_multimodal_evaluation( split: Dataset split to use (default: dev) workers: Number of workers to use for evaluation run_id: Optional run ID for the evaluation + modal: Whether to use Modal for evaluation (default: True) Returns: Path to the generated report.json file, or None if not found diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index 57ee83506..bdfb7b13e 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -25,7 +25,6 @@ EvalOutput, ) from benchmarks.utils.version import IMAGE_TAG_PREFIX -from openhands.agent_server.docker.build import _base_slug from openhands.sdk import Agent, Conversation, Tool, __version__, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool @@ -55,6 +54,10 @@ def get_agent_server_docker_image( target: str = "source-minimal", ) -> str: """Get the agent server Docker image for an instance.""" + # Importing here because openhands.agent_server.docker.build runs git checks + # which fails when installed as a package outside the git repo + 
"""Tests for openagentsafety generate_report functionality."""

import json
import tempfile
from pathlib import Path

from benchmarks.openagentsafety.run_infer import generate_report


def _report_for(jsonl_text: str) -> dict:
    """Run generate_report on *jsonl_text* in a scratch dir and return the parsed report."""
    with tempfile.TemporaryDirectory() as scratch:
        source = Path(scratch) / "output.jsonl"
        target = Path(scratch) / "output.report.json"
        source.write_text(jsonl_text)

        generate_report(str(source), str(target), "test-model")

        return json.loads(target.read_text())


def _as_jsonl(rows: list) -> str:
    """Serialise *rows* as newline-terminated JSON lines."""
    return "".join(f"{json.dumps(row)}\n" for row in rows)


def test_mixed_results():
    """Resolved, unresolved, and error instances each land in the right bucket."""
    rows = [
        # resolved: result == total > 0
        {
            "instance_id": "resolved-1",
            "test_result": {"final_score": {"result": 3, "total": 3}},
        },
        # unresolved: result < total
        {
            "instance_id": "unresolved-1",
            "test_result": {"final_score": {"result": 1, "total": 3}},
        },
        # unresolved: result == 0
        {
            "instance_id": "unresolved-2",
            "test_result": {"final_score": {"result": 0, "total": 2}},
        },
        # error reported at the top level
        {
            "instance_id": "error-1",
            "error": "timeout",
            "test_result": {},
        },
        # error reported inside test_result
        {
            "instance_id": "error-2",
            "test_result": {"error": "evaluation crashed"},
        },
    ]

    summary = _report_for(_as_jsonl(rows))

    assert summary["model_name_or_path"] == "test-model"
    assert summary["resolved_instances"] == 1
    assert summary["resolved_ids"] == ["resolved-1"]
    assert summary["unresolved_instances"] == 2
    assert sorted(summary["unresolved_ids"]) == ["unresolved-1", "unresolved-2"]
    assert summary["error_instances"] == 2
    assert summary["completed_instances"] == 3
    assert summary["submitted_instances"] == 5
    assert summary["total_instances"] == 5


def test_empty_file():
    """An empty output.jsonl yields a report whose counters are all zero."""
    summary = _report_for("")

    for counter in (
        "total_instances",
        "resolved_instances",
        "unresolved_instances",
        "error_instances",
    ):
        assert summary[counter] == 0


def test_missing_file():
    """No report file is created when output.jsonl does not exist."""
    with tempfile.TemporaryDirectory() as scratch:
        target = Path(scratch) / "output.report.json"
        generate_report("/nonexistent/output.jsonl", str(target), "m")
        assert not target.exists()


def test_malformed_json_lines_skipped():
    """Lines that are not valid JSON are ignored without failing the run."""
    good_row = {
        "instance_id": "good-1",
        "test_result": {"final_score": {"result": 1, "total": 1}},
    }

    summary = _report_for("not valid json\n" + _as_jsonl([good_row]))

    assert summary["resolved_instances"] == 1
    assert summary["resolved_ids"] == ["good-1"]
    assert summary["total_instances"] == 1


def test_missing_final_score_is_unresolved():
    """An instance without final_score counts as completed but unresolved."""
    summary = _report_for(_as_jsonl([{"instance_id": "no-score", "test_result": {}}]))

    assert summary["completed_instances"] == 1
    assert summary["resolved_instances"] == 0
    assert summary["unresolved_instances"] == 1
    assert summary["unresolved_ids"] == ["no-score"]