From a878d5c6d5335d27c6951c7b0bfbb1fc23f4f4e4 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 3 Mar 2026 11:51:02 -0300
Subject: [PATCH 1/8] Fix benchmark bugs: error handling, image naming, lazy
 imports, and more

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/gaia/run_infer.py                |   1 +
 benchmarks/multiswebench/build_images.py    |  20 ++-
 benchmarks/multiswebench/eval_infer.py      |   4 +-
 benchmarks/openagentsafety/build_images.py  |  16 ++-
 benchmarks/openagentsafety/run_infer.py     | 146 +++++++++++++++-----
 benchmarks/swebench/eval_infer.py           |  15 +-
 benchmarks/swebenchmultimodal/eval_infer.py |   1 +
 benchmarks/swtbench/run_infer.py            |   7 +-
 benchmarks/utils/build_utils.py             |   6 +-
 9 files changed, 159 insertions(+), 57 deletions(-)

diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 6044479f6..a100a2cfb 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -592,6 +592,7 @@ def main() -> None:
         max_attempts=args.max_attempts,
         critic=critic,
         selected_instances_file=args.select,
+        max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
     )
diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py
index 3ecdeeb67..86894e04d 100644
--- a/benchmarks/multiswebench/build_images.py
+++ b/benchmarks/multiswebench/build_images.py
@@ -8,15 +8,16 @@
     --image ghcr.io/openhands/eval-agent-server --target source-minimal
 """
 
+import json
 import os
 from pathlib import Path
 
+from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
 from benchmarks.utils.build_utils import (
     build_all_images,
     default_build_output_dir,
     get_build_parser,
 )
-from benchmarks.utils.dataset import get_dataset
 from openhands.sdk import get_logger
 
 
@@ -37,7 +38,7 @@ def get_official_docker_image(
 
     # For Multi-SWE-Bench, the image naming depends on the language
     repo = instance["repo"]
-    version = instance["version"]
+    version = instance.get("version", "")
 
     if LANGUAGE == "python":
         # Use SWE-bench style naming for Python
@@ -52,7 +53,7 @@ def get_official_docker_image(
         else:
             org = instance.get("org", repo)
             repo_name = repo
-        official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base"
+        official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base".lower()
 
     logger.debug(f"Multi-SWE-Bench image: {official_image_name}")
     return official_image_name
@@ -79,12 +80,16 @@ def extract_custom_tag(base_image: str) -> str:
 
 def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]:
     """Get all unique base images from the dataset."""
-    dataset = get_dataset(dataset_name, split)
+    local_path = download_and_concat_dataset(dataset_name, LANGUAGE)
     base_images = set()
 
-    for _, row in dataset.iterrows():
-        image = get_official_docker_image(row.to_dict())
-        base_images.add(image)
+    with open(local_path, "r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            instance = json.loads(line)
+            image = get_official_docker_image(instance)
+            base_images.add(image)
 
     return list(base_images)
 
@@ -107,6 +112,7 @@ def main():
         build_dir=Path(
             args.output_dir or default_build_output_dir(args.dataset, args.split)
         ),
+        base_image_to_custom_tag_fn=extract_custom_tag,
         max_workers=args.num_workers,
         dry_run=False,
     )
diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py
index 43fc61bc5..0578f80e0 100644
--- a/benchmarks/multiswebench/eval_infer.py
+++ b/benchmarks/multiswebench/eval_infer.py
@@ -95,11 +95,13 @@ def run_multi_swebench_evaluation(
             error_msg = f"Evaluation failed with return code {result.returncode}"
             print(f"ERROR: {error_msg}")
             logger.error(error_msg)
+            raise subprocess.CalledProcessError(result.returncode, cmd)
 
     except Exception as e:
         error_msg = f"Error running evaluation: {e}"
         print(f"ERROR: {error_msg}")
         logger.error(error_msg)
+        raise
 
 
 def main():
@@ -139,7 +141,7 @@ def main():
         logger.info(f"Results saved to {results_file}")
 
         # Move the report file to the output location
-        output_report_path = args.input_file.with_suffix(".report.json")
+        output_report_path = Path(args.input_file).with_suffix(".report.json")
         shutil.move(str(results_file), str(output_report_path))
         logger.info(f"Report moved to {output_report_path}")
 
diff --git a/benchmarks/openagentsafety/build_images.py b/benchmarks/openagentsafety/build_images.py
index acb183844..29a39e478 100644
--- a/benchmarks/openagentsafety/build_images.py
+++ b/benchmarks/openagentsafety/build_images.py
@@ -1,6 +1,7 @@
 """Build OpenAgentSafety Docker image from vendor/software-agent-sdk"""
 
 import logging
+import os
 import subprocess
 from pathlib import Path
 
@@ -31,6 +32,16 @@ def get_vendor_sdk_commit() -> str:
     return result.stdout.strip()
 
 
+def get_image_name() -> str:
+    image_name = os.getenv("EVAL_AGENT_SERVER_IMAGE", "openagentsafety-agent-server")
+    tag_prefix = os.getenv("IMAGE_TAG_PREFIX")
+    if tag_prefix:
+        tag = f"{tag_prefix}-openagentsafety"
+    else:
+        tag = get_vendor_sdk_commit()
+    return f"{image_name}:{tag}"
+
+
 def check_image_exists(image_name: str) -> bool:
     """Check if a Docker image exists locally."""
     result = subprocess.run(
@@ -48,13 +59,14 @@ def build_workspace_image(force_rebuild: bool = False, no_cache: bool = False) -
         force_rebuild: if True, ignore existing images and rebuild.
         no_cache: if True, pass --no-cache to docker build to avoid layer cache.
     """
-    sdk_commit = get_vendor_sdk_commit()
-    image_name = f"openagentsafety-agent-server:{sdk_commit}"
+    image_name = get_image_name()
 
     if not force_rebuild and check_image_exists(image_name):
         logger.info(f"#### Using existing image: {image_name}")
         return image_name
 
+    sdk_commit = get_vendor_sdk_commit()
+
     logger.info(f"#### Building Docker image: {image_name}")
     logger.info(f"#### SDK version: {sdk_commit}")
     logger.info("#### This will take approximately 3-5 minutes...")
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 3691e6a96..54d9e5a9c 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -4,7 +4,6 @@
 import json
 import os
 import subprocess
-import tempfile
 import time
 from typing import Any, List
 
@@ -13,7 +12,11 @@
 import requests
 from jinja2 import Environment, FileSystemLoader
 
-from benchmarks.openagentsafety.build_images import build_workspace_image
+from benchmarks.openagentsafety.build_images import (
+    build_workspace_image,
+    check_image_exists,
+    get_image_name,
+)
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -42,12 +45,16 @@ def convert_numpy_types(obj: Any) -> Any:
         return float(obj)
     elif isinstance(obj, np.ndarray):
         return obj.tolist()
-    elif pd.isna(obj):
-        return None
     elif isinstance(obj, dict):
         return {k: convert_numpy_types(v) for k, v in obj.items()}
     elif isinstance(obj, list):
         return [convert_numpy_types(item) for item in obj]
+    else:
+        try:
+            if pd.isna(obj):
+                return None
+        except (ValueError, TypeError):
+            pass
     return obj
 
 
@@ -61,8 +68,13 @@ def default(self, o):
             return float(o)
         elif isinstance(o, np.ndarray):
             return o.tolist()
-        elif pd.isna(o):
-            return None
+        elif hasattr(o, "model_dump"):
+            return o.model_dump()
+        try:
+            if pd.isna(o):
+                return None
+        except (ValueError, TypeError):
+            pass
         return super().default(o)
 
 
@@ -187,7 +199,7 @@ def cleanup_docker_containers():
                 "-a",
                 "-q",
                 "--filter",
-                "ancestor=openagentsafety-agent-server:local",
+                f"ancestor={get_image_name()}",
             ],
             capture_output=True,
             text=True,
@@ -235,39 +247,20 @@ def write_npc_config(
     }
 
     config_json = json.dumps(config, indent=2, cls=NumpyEncoder)
+    bash_command = f"""
+mkdir -p /npc
+cat > /npc/.npc_config.json << 'EOFNPC'
+{config_json}
+EOFNPC
+chmod 600 /npc/.npc_config.json
+"""
 
-    # Create /npc directory in container (doesn't leak sensitive info)
-    try:
-        workspace.execute_command("mkdir -p /npc", timeout=30)
-    except Exception as e:
-        logger.error(f"Failed to create /npc directory: {e}")
-        raise
-
-    # Write config to temporary file on host
-    temp_fd, temp_path = tempfile.mkstemp(suffix=".json", text=True)
     try:
-        with os.fdopen(temp_fd, "w") as f:
-            f.write(config_json)
-
-        # Upload file to container using file_upload (avoids bash command leak)
-        result = workspace.file_upload(
-            source_path=temp_path, destination_path="/npc/.npc_config.json"
-        )
-
-        if not result.success:
-            raise RuntimeError(f"File upload failed: {result}")
-
-        # Set restrictive permissions
-        workspace.execute_command("chmod 600 /npc/.npc_config.json", timeout=30)
-
-        logger.info("Wrote NPC config to /npc/.npc_config.json (via file_upload)")
+        workspace.execute_command(bash_command, timeout=60)
+        logger.info("Wrote NPC config to /npc/.npc_config.json")
     except Exception as e:
         logger.error(f"Failed to write NPC config: {e}")
         raise
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_path):
-            os.unlink(temp_path)
 
 
 def generate_instruction(instance_data: dict, template_path: str | None = None) -> str:
@@ -401,7 +394,18 @@ def prepare_workspace(
             resource_factor: Resource factor for runtime allocation (default: 1).
             forward_env: Environment variables to forward into the workspace.
         """
-        server_image = build_workspace_image()
+        # Try to build image on-the-fly, fall back to pre-built if build fails
+        try:
+            server_image = build_workspace_image()
+        except (subprocess.CalledProcessError, RuntimeError) as e:
+            logger.warning(f"On-the-fly build failed: {e}")
+            server_image = get_image_name()
+
+            if not check_image_exists(server_image):
+                raise RuntimeError(
+                    f"On-the-fly build failed and pre-built image {server_image} does not exist"
+                )
+            logger.info(f"Using pre-built image {server_image}")
 
         workspace = DockerWorkspace(
             server_image=server_image,
@@ -562,6 +566,68 @@ def event_callback(event) -> None:
         )
 
 
+def generate_report(output_jsonl: str, report_path: str, model_name: str) -> None:
+    """Generate a .report.json from the output.jsonl, matching the format
+    expected by nemo_evaluator (same schema as SWE-Bench / GAIA reports)."""
+    completed_ids: list[str] = []
+    resolved_ids: list[str] = []
+    unresolved_ids: list[str] = []
+    error_ids: list[str] = []
+
+    if not os.path.exists(output_jsonl):
+        logger.warning("No output.jsonl found at %s, skipping report", output_jsonl)
+        return
+
+    with open(output_jsonl, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+
+            instance_id = data.get("instance_id", "")
+            error = data.get("error")
+            test_result = data.get("test_result", {})
+
+            if error or test_result.get("error"):
+                error_ids.append(instance_id)
+            else:
+                completed_ids.append(instance_id)
+                # Treat as resolved when there is no error
+                resolved_ids.append(instance_id)
+
+    submitted_ids = completed_ids + error_ids
+    report = {
+        "model_name_or_path": model_name,
+        "total_instances": len(submitted_ids),
+        "submitted_instances": len(submitted_ids),
+        "completed_instances": len(completed_ids),
+        "incomplete_instances": 0,
+        "resolved_instances": len(resolved_ids),
+        "unresolved_instances": len(unresolved_ids),
+        "empty_patch_instances": 0,
+        "error_instances": len(error_ids),
+        "submitted_ids": submitted_ids,
+        "completed_ids": completed_ids,
+        "incomplete_ids": [],
+        "resolved_ids": resolved_ids,
+        "unresolved_ids": unresolved_ids,
+    }
+
+    with open(report_path, "w") as f:
+        json.dump(report, f, indent=4)
+
+    logger.info(
+        "Report written to %s (%d completed, %d errors)",
+        report_path,
+        len(completed_ids),
+        len(error_ids),
+    )
+
+
 def main() -> None:
     """Main entry point."""
     parser = get_parser(add_llm_config=True)
@@ -573,6 +639,7 @@ def main() -> None:
     if args.max_attempts < 1:
         raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
 
+    # Load LLM config
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
 
@@ -600,13 +667,14 @@ def main() -> None:
         max_iterations=args.max_iterations,
         eval_output_dir=structured_output_dir,
         details={
-            "server_image": "openagentsafety-agent-server:local",
+            "server_image": get_image_name(),
             "platform": "linux/amd64",
         },
         eval_limit=args.n_limit,
         max_attempts=args.max_attempts,
         critic=critic,
         selected_instances_file=args.select,
+        max_retries=args.max_retries,
         enable_delegation=args.enable_delegation,
     )
 
@@ -663,6 +731,10 @@ def _cb(instance: EvalInstance, out: EvalOutput) -> None:
     # Run evaluation
     evaluator.run(on_result=_default_on_result_writer(metadata.eval_output_dir))
 
+    # Generate .report.json for nemo_evaluator compatibility
+    report_path = os.path.join(metadata.eval_output_dir, "output.report.json")
+    generate_report(evaluator.output_path, report_path, llm.model)
+
     # Final cleanup
     cleanup_docker_containers()
 
diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index ab52b26ec..d91864d13 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -117,11 +117,11 @@ def convert_to_swebench_format(input_file: str, output_file: str) -> None:
 def run_swebench_evaluation(
     predictions_file: str,
     run_id: str,
-    dataset: str,
-    workers: int,
-    split: str,
-    modal: bool,
-    timeout: int,
+    dataset: str = EVAL_DEFAULTS["dataset"],
+    workers: int = EVAL_DEFAULTS["workers"],
+    split: str = EVAL_DEFAULTS["split"],
+    modal: bool = EVAL_DEFAULTS["modal"],
+    timeout: int = EVAL_DEFAULTS["timeout"],
 ) -> None:
     """
     Run SWE-Bench evaluation on the predictions file.
@@ -257,10 +257,11 @@ def main() -> None:
     parser.add_argument(
         "--timeout",
         type=int,
-        help="Timeout in seconds for evaluation",
+        default=EVAL_DEFAULTS["timeout"],
+        help=f"Timeout in seconds for evaluation (default: {EVAL_DEFAULTS['timeout']})",
     )
 
-    # Apply EVAL_DEFAULTS from config (for dataset, split, workers, modal, timeout)
+    # Apply EVAL_DEFAULTS from config (for dataset, split, workers)
     parser.set_defaults(**EVAL_DEFAULTS)
 
     args = parser.parse_args()
diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py
index b3adec48b..737213680 100644
--- a/benchmarks/swebenchmultimodal/eval_infer.py
+++ b/benchmarks/swebenchmultimodal/eval_infer.py
@@ -257,6 +257,7 @@ def run_swebench_multimodal_evaluation(
         split: Dataset split to use (default: dev)
         workers: Number of workers to use for evaluation
         run_id: Optional run ID for the evaluation
+        modal: Whether to use Modal for evaluation (default: True)
 
     Returns:
         Path to the generated report.json file, or None if not found
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index 57ee83506..0b72ae352 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -25,7 +25,6 @@
     EvalOutput,
 )
 from benchmarks.utils.version import IMAGE_TAG_PREFIX
-from openhands.agent_server.docker.build import _base_slug
 from openhands.sdk import Agent, Conversation, Tool, __version__, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
@@ -55,6 +54,10 @@ def get_agent_server_docker_image(
     target: str = "source-minimal",
 ) -> str:
     """Get the agent server Docker image for an instance."""
+    # Imported lazily: loading openhands.agent_server.docker.build runs git checks
+    # that fail when the package is installed outside the git repo
+    from openhands.agent_server.docker.build import _base_slug
+
     official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
     return (
         "ghcr.io/all-hands-ai/agent-server"
@@ -357,7 +360,7 @@ def main() -> None:
         dataset_name=dataset_description,
         model_name=llm.model,
         max_iterations=args.max_iterations,
-        eval_note=f"SWT-{args.note}" if args.note else None,
+        eval_note="SWT-" + args.note,
     )
 
     critic = create_critic(args)
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index f61110729..b4e56b9ad 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -20,6 +20,7 @@
 from pydantic import BaseModel, Field
 from tqdm.auto import tqdm
 
+from benchmarks.swebench.constants import TargetType
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.buildx_utils import (
     buildkit_disk_usage,
@@ -28,7 +29,6 @@
 )
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.image_utils import local_image_exists, remote_image_exists
-from openhands.agent_server.docker.build import BuildOptions, TargetType, build
 from openhands.sdk import get_logger
 
 
@@ -282,6 +282,10 @@ def build_image(
     target: TargetType = "source-minimal",
     push: bool = False,
 ) -> BuildOutput:
+    # Imported lazily: loading openhands.agent_server.docker.build runs git checks
+    # that fail when the package is installed outside the git repo
+    from openhands.agent_server.docker.build import BuildOptions, build
+
     # Get SDK info from submodule to ensure tags use the correct SDK SHA
     git_ref, git_sha, sdk_version = _get_sdk_submodule_info()
 

From d35321e3c39d952e4832dad4a056d766c2440714 Mon Sep 17 00:00:00 2001
From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com>
Date: Tue, 3 Mar 2026 11:57:12 -0300
Subject: [PATCH 2/8] Update benchmarks/swtbench/run_infer.py

Co-authored-by: OpenHands Bot <contact@all-hands.dev>
---
 benchmarks/swtbench/run_infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index 0b72ae352..6394f1dca 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -360,7 +360,7 @@ def main() -> None:
         dataset_name=dataset_description,
         model_name=llm.model,
         max_iterations=args.max_iterations,
-        eval_note="SWT-" + args.note,
+        eval_note="SWT-" + args.note if args.note else None,
     )
 
     critic = create_critic(args)

From e60e6f30aee2fbcc55eca90683bea6f6217c8e7b Mon Sep 17 00:00:00 2001
From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com>
Date: Tue, 3 Mar 2026 11:58:14 -0300
Subject: [PATCH 3/8] Update benchmarks/multiswebench/build_images.py

Co-authored-by: OpenHands Bot <contact@all-hands.dev>
---
 benchmarks/multiswebench/build_images.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py
index 86894e04d..6c3ef3b9f 100644
--- a/benchmarks/multiswebench/build_images.py
+++ b/benchmarks/multiswebench/build_images.py
@@ -87,7 +87,11 @@ def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]:
         for line in f:
             if not line.strip():
                 continue
-            instance = json.loads(line)
+            try:
+                instance = json.loads(line)
+            except json.JSONDecodeError as e:
+                logger.warning(f"Skipping malformed JSON line: {e}")
+                continue
             image = get_official_docker_image(instance)
             base_images.add(image)
 

From 2c1f971b0603d24287cf5d7ad0697ae21bb3f402 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 3 Mar 2026 12:00:31 -0300
Subject: [PATCH 4/8] Address PR review: cleaner pd.isna guard, document
 heredoc tradeoff
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace try/except pd.isna() with explicit isinstance guard — dicts
  and lists are already handled by prior elif branches, so pd.isna()
  is only called on scalar-like values
- Add comment documenting that the heredoc approach embeds config in
  the bash command, but this is acceptable since the config contains
  NPC scenario data, not secrets (API keys are resolved via env vars)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/openagentsafety/run_infer.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 54d9e5a9c..84850978e 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -49,12 +49,9 @@ def convert_numpy_types(obj: Any) -> Any:
         return {k: convert_numpy_types(v) for k, v in obj.items()}
     elif isinstance(obj, list):
         return [convert_numpy_types(item) for item in obj]
-    else:
-        try:
-            if pd.isna(obj):
-                return None
-        except (ValueError, TypeError):
-            pass
+    # pd.isna() returns an array for lists (ambiguous in `elif`) — lists/dicts are handled above
+    elif pd.isna(obj):
+        return None
     return obj
 
 
@@ -70,11 +67,10 @@ def default(self, o):
             return o.tolist()
         elif hasattr(o, "model_dump"):
             return o.model_dump()
-        try:
-            if pd.isna(o):
-                return None
-        except (ValueError, TypeError):
-            pass
+        # JSONEncoder.default() is only called for non-serializable types, so
+        # lists (whose array-valued pd.isna() result is ambiguous) won't reach here.
+        elif pd.isna(o):
+            return None
         return super().default(o)
 
 
@@ -247,6 +243,11 @@ def write_npc_config(
     }
 
     config_json = json.dumps(config, indent=2, cls=NumpyEncoder)
+    # NOTE: The heredoc approach is simpler than the previous tempfile+upload but
+    # embeds config content in the bash command string, which could appear in
+    # container logs or process listings. This is acceptable here because the
+    # config contains NPC scenario data (not secrets) — API keys are resolved
+    # separately via environment variables and never written to this file.
     bash_command = f"""
 mkdir -p /npc
 cat > /npc/.npc_config.json << 'EOFNPC'

From aa613e99869aac4fc7bef7293feb1c986f182ae2 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 3 Mar 2026 12:13:01 -0300
Subject: [PATCH 5/8] Fix generate_report resolved/unresolved logic, add tests

The generate_report() function was treating all non-error instances as
resolved, leaving unresolved_ids always empty. Now uses the same logic
as eval_infer.py: an instance is resolved only when
final_score.result > 0 and result == total.

Added tests covering: mixed results, empty file, missing file,
malformed JSON lines, and missing final_score.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/openagentsafety/run_infer.py |  15 ++-
 tests/test_openagentsafety_report.py    | 125 ++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_openagentsafety_report.py

diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 84850978e..3c55b5310 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -569,7 +569,11 @@ def event_callback(event) -> None:
 
 def generate_report(output_jsonl: str, report_path: str, model_name: str) -> None:
     """Generate a .report.json from the output.jsonl, matching the format
-    expected by nemo_evaluator (same schema as SWE-Bench / GAIA reports)."""
+    used by other benchmarks (SWE-Bench, GAIA, etc.).
+
+    Resolution logic mirrors eval_infer.py: an instance is "resolved" only
+    when ``final_score.result > 0`` and ``final_score.result == final_score.total``.
+    """
     completed_ids: list[str] = []
     resolved_ids: list[str] = []
     unresolved_ids: list[str] = []
@@ -597,8 +601,13 @@ def generate_report(output_jsonl: str, report_path: str, model_name: str) -> Non
                 error_ids.append(instance_id)
             else:
                 completed_ids.append(instance_id)
-                # Treat as resolved when there is no error
-                resolved_ids.append(instance_id)
+                final_score = test_result.get("final_score", {})
+                result = final_score.get("result", 0)
+                total = final_score.get("total", 0)
+                if result > 0 and result == total:
+                    resolved_ids.append(instance_id)
+                else:
+                    unresolved_ids.append(instance_id)
 
     submitted_ids = completed_ids + error_ids
     report = {
diff --git a/tests/test_openagentsafety_report.py b/tests/test_openagentsafety_report.py
new file mode 100644
index 000000000..65dce8d6f
--- /dev/null
+++ b/tests/test_openagentsafety_report.py
@@ -0,0 +1,125 @@
+"""Tests for openagentsafety generate_report functionality."""
+
+import json
+import tempfile
+from pathlib import Path
+
+from benchmarks.openagentsafety.run_infer import generate_report
+
+
+def test_mixed_results():
+    """Resolved, unresolved, and error instances are classified correctly."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_jsonl = Path(tmpdir) / "output.jsonl"
+        report_path = Path(tmpdir) / "output.report.json"
+
+        lines = [
+            # resolved: result == total > 0
+            {
+                "instance_id": "resolved-1",
+                "test_result": {"final_score": {"result": 3, "total": 3}},
+            },
+            # unresolved: result < total
+            {
+                "instance_id": "unresolved-1",
+                "test_result": {"final_score": {"result": 1, "total": 3}},
+            },
+            # unresolved: result == 0
+            {
+                "instance_id": "unresolved-2",
+                "test_result": {"final_score": {"result": 0, "total": 2}},
+            },
+            # error at top level
+            {
+                "instance_id": "error-1",
+                "error": "timeout",
+                "test_result": {},
+            },
+            # error inside test_result
+            {
+                "instance_id": "error-2",
+                "test_result": {"error": "evaluation crashed"},
+            },
+        ]
+        output_jsonl.write_text(
+            "\n".join(json.dumps(l) for l in lines) + "\n"
+        )
+
+        generate_report(str(output_jsonl), str(report_path), "test-model")
+
+        report = json.loads(report_path.read_text())
+
+        assert report["model_name_or_path"] == "test-model"
+        assert report["resolved_instances"] == 1
+        assert report["resolved_ids"] == ["resolved-1"]
+        assert report["unresolved_instances"] == 2
+        assert sorted(report["unresolved_ids"]) == ["unresolved-1", "unresolved-2"]
+        assert report["error_instances"] == 2
+        assert report["completed_instances"] == 3
+        assert report["submitted_instances"] == 5
+        assert report["total_instances"] == 5
+
+
+def test_empty_file():
+    """An empty output.jsonl produces a report with all zeroes."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_jsonl = Path(tmpdir) / "output.jsonl"
+        report_path = Path(tmpdir) / "output.report.json"
+        output_jsonl.write_text("")
+
+        generate_report(str(output_jsonl), str(report_path), "test-model")
+
+        report = json.loads(report_path.read_text())
+        assert report["total_instances"] == 0
+        assert report["resolved_instances"] == 0
+        assert report["unresolved_instances"] == 0
+        assert report["error_instances"] == 0
+
+
+def test_missing_file():
+    """A missing output.jsonl produces no report file."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        report_path = Path(tmpdir) / "output.report.json"
+        generate_report("/nonexistent/output.jsonl", str(report_path), "m")
+        assert not report_path.exists()
+
+
+def test_malformed_json_lines_skipped():
+    """Malformed JSON lines are silently skipped."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_jsonl = Path(tmpdir) / "output.jsonl"
+        report_path = Path(tmpdir) / "output.report.json"
+
+        content = "not valid json\n" + json.dumps(
+            {
+                "instance_id": "good-1",
+                "test_result": {"final_score": {"result": 1, "total": 1}},
+            }
+        ) + "\n"
+        output_jsonl.write_text(content)
+
+        generate_report(str(output_jsonl), str(report_path), "test-model")
+
+        report = json.loads(report_path.read_text())
+        assert report["resolved_instances"] == 1
+        assert report["resolved_ids"] == ["good-1"]
+        assert report["total_instances"] == 1
+
+
+def test_missing_final_score_is_unresolved():
+    """An instance with no final_score is completed but unresolved."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_jsonl = Path(tmpdir) / "output.jsonl"
+        report_path = Path(tmpdir) / "output.report.json"
+
+        output_jsonl.write_text(
+            json.dumps({"instance_id": "no-score", "test_result": {}}) + "\n"
+        )
+
+        generate_report(str(output_jsonl), str(report_path), "test-model")
+
+        report = json.loads(report_path.read_text())
+        assert report["completed_instances"] == 1
+        assert report["resolved_instances"] == 0
+        assert report["unresolved_instances"] == 1
+        assert report["unresolved_ids"] == ["no-score"]

From ce17d9bce21cbd9e9494db29f587d0adf9523262 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 3 Mar 2026 12:33:32 -0300
Subject: [PATCH 6/8] Revert redundant argparse default for --timeout in
 swebench eval

The explicit default= and updated help/comment were unnecessary since
parser.set_defaults(**EVAL_DEFAULTS) already covers timeout. Keep only
the function signature defaults on run_swebench_evaluation().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/swebench/eval_infer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index d91864d13..379d718da 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -257,11 +257,10 @@ def main() -> None:
     parser.add_argument(
         "--timeout",
         type=int,
-        default=EVAL_DEFAULTS["timeout"],
-        help=f"Timeout in seconds for evaluation (default: {EVAL_DEFAULTS['timeout']})",
+        help="Timeout in seconds for evaluation",
     )
 
-    # Apply EVAL_DEFAULTS from config (for dataset, split, workers)
+    # Apply EVAL_DEFAULTS from config (for dataset, split, workers, modal, timeout)
     parser.set_defaults(**EVAL_DEFAULTS)
 
     args = parser.parse_args()

From b1cecbcc34ce4d26d9b9d8a48d19c03370da1323 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 3 Mar 2026 12:35:37 -0300
Subject: [PATCH 7/8] Revert unnecessary eval_note style change in swtbench

The f-string and concatenation forms are functionally identical.
Keep the original f-string.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/swtbench/run_infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index 6394f1dca..bdfb7b13e 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -360,7 +360,7 @@ def main() -> None:
         dataset_name=dataset_description,
         model_name=llm.model,
         max_iterations=args.max_iterations,
-        eval_note="SWT-" + args.note if args.note else None,
+        eval_note=f"SWT-{args.note}" if args.note else None,
     )
 
     critic = create_critic(args)

From f578fd6a73f1c892e700c6409d532c10bef146da Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 3 Mar 2026 12:38:50 -0300
Subject: [PATCH 8/8] Fix pre-commit: rename ambiguous variable, fix ruff
 formatting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/test_openagentsafety_report.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tests/test_openagentsafety_report.py b/tests/test_openagentsafety_report.py
index 65dce8d6f..f21741548 100644
--- a/tests/test_openagentsafety_report.py
+++ b/tests/test_openagentsafety_report.py
@@ -41,9 +41,7 @@ def test_mixed_results():
                 "test_result": {"error": "evaluation crashed"},
             },
         ]
-        output_jsonl.write_text(
-            "\n".join(json.dumps(l) for l in lines) + "\n"
-        )
+        output_jsonl.write_text("\n".join(json.dumps(entry) for entry in lines) + "\n")
 
         generate_report(str(output_jsonl), str(report_path), "test-model")
 
@@ -90,12 +88,16 @@ def test_malformed_json_lines_skipped():
         output_jsonl = Path(tmpdir) / "output.jsonl"
         report_path = Path(tmpdir) / "output.report.json"
 
-        content = "not valid json\n" + json.dumps(
-            {
-                "instance_id": "good-1",
-                "test_result": {"final_score": {"result": 1, "total": 1}},
-            }
-        ) + "\n"
+        content = (
+            "not valid json\n"
+            + json.dumps(
+                {
+                    "instance_id": "good-1",
+                    "test_result": {"final_score": {"result": 1, "total": 1}},
+                }
+            )
+            + "\n"
+        )
         output_jsonl.write_text(content)
 
         generate_report(str(output_jsonl), str(report_path), "test-model")