Skip to content
1 change: 1 addition & 0 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ def main() -> None:
max_attempts=args.max_attempts,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
workspace_type=args.workspace,
enable_delegation=args.enable_delegation,
)
Expand Down
24 changes: 17 additions & 7 deletions benchmarks/multiswebench/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@
--image ghcr.io/openhands/eval-agent-server --target source-minimal
"""

import json
import os
from pathlib import Path

from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
get_build_parser,
)
from benchmarks.utils.dataset import get_dataset
from openhands.sdk import get_logger


Expand All @@ -37,7 +38,7 @@ def get_official_docker_image(

# For Multi-SWE-Bench, the image naming depends on the language
repo = instance["repo"]
version = instance["version"]
version = instance.get("version", "")

if LANGUAGE == "python":
# Use SWE-bench style naming for Python
Expand All @@ -52,7 +53,7 @@ def get_official_docker_image(
else:
org = instance.get("org", repo)
repo_name = repo
official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base"
official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base".lower()

logger.debug(f"Multi-SWE-Bench image: {official_image_name}")
return official_image_name
Expand All @@ -79,12 +80,20 @@ def extract_custom_tag(base_image: str) -> str:

def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]:
"""Get all unique base images from the dataset."""
dataset = get_dataset(dataset_name, split)
local_path = download_and_concat_dataset(dataset_name, LANGUAGE)
base_images = set()

for _, row in dataset.iterrows():
image = get_official_docker_image(row.to_dict())
base_images.add(image)
with open(local_path, "r", encoding="utf-8") as f:
for line in f:
if not line.strip():
continue
try:
instance = json.loads(line)
except json.JSONDecodeError as e:
logger.warning(f"Skipping malformed JSON line: {e}")
continue
image = get_official_docker_image(instance)
base_images.add(image)

return list(base_images)

Expand All @@ -107,6 +116,7 @@ def main():
build_dir=Path(
args.output_dir or default_build_output_dir(args.dataset, args.split)
),
base_image_to_custom_tag_fn=extract_custom_tag,
max_workers=args.num_workers,
dry_run=False,
)
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/multiswebench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,13 @@ def run_multi_swebench_evaluation(
error_msg = f"Evaluation failed with return code {result.returncode}"
print(f"ERROR: {error_msg}")
logger.error(error_msg)
raise subprocess.CalledProcessError(result.returncode, cmd)

except Exception as e:
error_msg = f"Error running evaluation: {e}"
print(f"ERROR: {error_msg}")
logger.error(error_msg)
raise


def main():
Expand Down Expand Up @@ -139,7 +141,7 @@ def main():
logger.info(f"Results saved to {results_file}")

# Move the report file to the output location
output_report_path = args.input_file.with_suffix(".report.json")
output_report_path = Path(args.input_file).with_suffix(".report.json")
shutil.move(str(results_file), str(output_report_path))
logger.info(f"Report moved to {output_report_path}")

Expand Down
16 changes: 14 additions & 2 deletions benchmarks/openagentsafety/build_images.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Build OpenAgentSafety Docker image from vendor/software-agent-sdk"""

import logging
import os
import subprocess
from pathlib import Path

Expand Down Expand Up @@ -31,6 +32,16 @@ def get_vendor_sdk_commit() -> str:
return result.stdout.strip()


def get_image_name() -> str:
    """Return the full Docker image reference (``name:tag``) for the eval server.

    The repository name comes from the ``EVAL_AGENT_SERVER_IMAGE`` environment
    variable, defaulting to ``"openagentsafety-agent-server"``. The tag is
    ``"<IMAGE_TAG_PREFIX>-openagentsafety"`` when ``IMAGE_TAG_PREFIX`` is set
    (and non-empty); otherwise the vendored SDK commit hash is used.
    """
    name = os.getenv("EVAL_AGENT_SERVER_IMAGE", "openagentsafety-agent-server")
    prefix = os.getenv("IMAGE_TAG_PREFIX")
    # Truthy check: an empty IMAGE_TAG_PREFIX falls back to the SDK commit,
    # matching the behavior of a plain `if prefix:` guard.
    tag = f"{prefix}-openagentsafety" if prefix else get_vendor_sdk_commit()
    return f"{name}:{tag}"


def check_image_exists(image_name: str) -> bool:
"""Check if a Docker image exists locally."""
result = subprocess.run(
Expand All @@ -48,13 +59,14 @@ def build_workspace_image(force_rebuild: bool = False, no_cache: bool = False) -
force_rebuild: if True, ignore existing images and rebuild.
no_cache: if True, pass --no-cache to docker build to avoid layer cache.
"""
sdk_commit = get_vendor_sdk_commit()
image_name = f"openagentsafety-agent-server:{sdk_commit}"
image_name = get_image_name()

if not force_rebuild and check_image_exists(image_name):
logger.info(f"#### Using existing image: {image_name}")
return image_name

sdk_commit = get_vendor_sdk_commit()

logger.info(f"#### Building Docker image: {image_name}")
logger.info(f"#### SDK version: {sdk_commit}")
logger.info("#### This will take approximately 3-5 minutes...")
Expand Down
152 changes: 117 additions & 35 deletions benchmarks/openagentsafety/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import json
import os
import subprocess
import tempfile
import time
from typing import Any, List

Expand All @@ -13,7 +12,11 @@
import requests
from jinja2 import Environment, FileSystemLoader

from benchmarks.openagentsafety.build_images import build_workspace_image
from benchmarks.openagentsafety.build_images import (
build_workspace_image,
check_image_exists,
get_image_name,
)
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.console_logging import summarize_instance
from benchmarks.utils.conversation import build_event_persistence_callback
Expand Down Expand Up @@ -42,12 +45,13 @@ def convert_numpy_types(obj: Any) -> Any:
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif pd.isna(obj):
return None
elif isinstance(obj, dict):
return {k: convert_numpy_types(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [convert_numpy_types(item) for item in obj]
# pd.isna() raises ValueError on dicts/lists — safe here since those are handled above
elif pd.isna(obj):
return None
return obj


Expand All @@ -61,6 +65,10 @@ def default(self, o):
return float(o)
elif isinstance(o, np.ndarray):
return o.tolist()
elif hasattr(o, "model_dump"):
return o.model_dump()
# JSONEncoder.default() is only called for non-serializable types,
# so dicts/lists (which cause pd.isna to raise) won't reach here.
elif pd.isna(o):
return None
return super().default(o)
Expand Down Expand Up @@ -187,7 +195,7 @@ def cleanup_docker_containers():
"-a",
"-q",
"--filter",
"ancestor=openagentsafety-agent-server:local",
f"ancestor={get_image_name()}",
],
capture_output=True,
text=True,
Expand Down Expand Up @@ -235,39 +243,25 @@ def write_npc_config(
}

config_json = json.dumps(config, indent=2, cls=NumpyEncoder)
# NOTE: The heredoc approach is simpler than the previous tempfile+upload but
# embeds config content in the bash command string, which could appear in
# container logs or process listings. This is acceptable here because the
# config contains NPC scenario data (not secrets) — API keys are resolved
# separately via environment variables and never written to this file.
bash_command = f"""
mkdir -p /npc
cat > /npc/.npc_config.json << 'EOFNPC'
{config_json}
EOFNPC
chmod 600 /npc/.npc_config.json
"""

# Create /npc directory in container (doesn't leak sensitive info)
try:
workspace.execute_command("mkdir -p /npc", timeout=30)
except Exception as e:
logger.error(f"Failed to create /npc directory: {e}")
raise

# Write config to temporary file on host
temp_fd, temp_path = tempfile.mkstemp(suffix=".json", text=True)
try:
with os.fdopen(temp_fd, "w") as f:
f.write(config_json)

# Upload file to container using file_upload (avoids bash command leak)
result = workspace.file_upload(
source_path=temp_path, destination_path="/npc/.npc_config.json"
)

if not result.success:
raise RuntimeError(f"File upload failed: {result}")

# Set restrictive permissions
workspace.execute_command("chmod 600 /npc/.npc_config.json", timeout=30)

logger.info("Wrote NPC config to /npc/.npc_config.json (via file_upload)")
workspace.execute_command(bash_command, timeout=60)
logger.info("Wrote NPC config to /npc/.npc_config.json")
except Exception as e:
logger.error(f"Failed to write NPC config: {e}")
raise
finally:
# Clean up temporary file
if os.path.exists(temp_path):
os.unlink(temp_path)


def generate_instruction(instance_data: dict, template_path: str | None = None) -> str:
Expand Down Expand Up @@ -401,7 +395,18 @@ def prepare_workspace(
resource_factor: Resource factor for runtime allocation (default: 1).
forward_env: Environment variables to forward into the workspace.
"""
server_image = build_workspace_image()
# Try to build image on-the-fly, fall back to pre-built if build fails
try:
server_image = build_workspace_image()
except (subprocess.CalledProcessError, RuntimeError) as e:
logger.warning(f"On-the-fly build failed: {e}")
server_image = get_image_name()

if not check_image_exists(server_image):
raise RuntimeError(
f"On-the-fly build failed and pre-built image {server_image} does not exist"
)
logger.info(f"Using pre-built image {server_image}")

workspace = DockerWorkspace(
server_image=server_image,
Expand Down Expand Up @@ -562,6 +567,77 @@ def event_callback(event) -> None:
)


def generate_report(output_jsonl: str, report_path: str, model_name: str) -> None:
    """Generate a .report.json from the output.jsonl, matching the format
    used by other benchmarks (SWE-Bench, GAIA, etc.).

    Resolution logic mirrors eval_infer.py: an instance is "resolved" only
    when ``final_score.result > 0`` and ``final_score.result == final_score.total``.
    """
    # Buckets for instance IDs; an instance lands in exactly one of
    # (error) or (completed), and completed splits into resolved/unresolved.
    completed_ids: list[str] = []
    resolved_ids: list[str] = []
    unresolved_ids: list[str] = []
    error_ids: list[str] = []

    if not os.path.exists(output_jsonl):
        logger.warning("No output.jsonl found at %s, skipping report", output_jsonl)
        return

    with open(output_jsonl, "r") as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                # Tolerate partially-written or corrupt lines.
                continue

            iid = record.get("instance_id", "")
            test_result = record.get("test_result", {})

            # An error at either the harness level or inside the test
            # result marks the instance as errored.
            if record.get("error") or test_result.get("error"):
                error_ids.append(iid)
                continue

            completed_ids.append(iid)
            final_score = test_result.get("final_score", {})
            score = final_score.get("result", 0)
            if score > 0 and score == final_score.get("total", 0):
                resolved_ids.append(iid)
            else:
                unresolved_ids.append(iid)

    submitted_ids = completed_ids + error_ids
    report = {
        "model_name_or_path": model_name,
        "total_instances": len(submitted_ids),
        "submitted_instances": len(submitted_ids),
        "completed_instances": len(completed_ids),
        "incomplete_instances": 0,
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": 0,
        "error_instances": len(error_ids),
        "submitted_ids": submitted_ids,
        "completed_ids": completed_ids,
        "incomplete_ids": [],
        "resolved_ids": resolved_ids,
        "unresolved_ids": unresolved_ids,
    }

    with open(report_path, "w") as f:
        json.dump(report, f, indent=4)

    logger.info(
        "Report written to %s (%d completed, %d errors)",
        report_path,
        len(completed_ids),
        len(error_ids),
    )


def main() -> None:
"""Main entry point."""
parser = get_parser(add_llm_config=True)
Expand All @@ -573,6 +649,7 @@ def main() -> None:
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

# Load LLM config
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

Expand Down Expand Up @@ -600,13 +677,14 @@ def main() -> None:
max_iterations=args.max_iterations,
eval_output_dir=structured_output_dir,
details={
"server_image": "openagentsafety-agent-server:local",
"server_image": get_image_name(),
"platform": "linux/amd64",
},
eval_limit=args.n_limit,
max_attempts=args.max_attempts,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
enable_delegation=args.enable_delegation,
)

Expand Down Expand Up @@ -663,6 +741,10 @@ def _cb(instance: EvalInstance, out: EvalOutput) -> None:
# Run evaluation
evaluator.run(on_result=_default_on_result_writer(metadata.eval_output_dir))

# Generate .report.json for nemo_evaluator compatibility
report_path = os.path.join(metadata.eval_output_dir, "output.report.json")
generate_report(evaluator.output_path, report_path, llm.model)

# Final cleanup
cleanup_docker_containers()

Expand Down
Loading
Loading