diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 6044479f6..a100a2cfb 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -592,6 +592,7 @@ def main() -> None:
         max_attempts=args.max_attempts,
         critic=critic,
         selected_instances_file=args.select,
+        max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
     )
diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py
index 3ecdeeb67..6c3ef3b9f 100644
--- a/benchmarks/multiswebench/build_images.py
+++ b/benchmarks/multiswebench/build_images.py
@@ -8,15 +8,16 @@
     --image ghcr.io/openhands/eval-agent-server --target source-minimal
 """
 
+import json
 import os
 from pathlib import Path
 
+from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
 from benchmarks.utils.build_utils import (
     build_all_images,
     default_build_output_dir,
     get_build_parser,
 )
-from benchmarks.utils.dataset import get_dataset
 from openhands.sdk import get_logger
 
 
@@ -37,7 +38,7 @@ def get_official_docker_image(
 
     # For Multi-SWE-Bench, the image naming depends on the language
     repo = instance["repo"]
-    version = instance["version"]
+    version = instance.get("version", "")
 
     if LANGUAGE == "python":
         # Use SWE-bench style naming for Python
@@ -52,7 +53,7 @@ def get_official_docker_image(
         else:
             org = instance.get("org", repo)
             repo_name = repo
-        official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base"
+        official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base".lower()
 
     logger.debug(f"Multi-SWE-Bench image: {official_image_name}")
     return official_image_name
@@ -79,12 +80,20 @@ def extract_custom_tag(base_image: str) -> str:
 
 def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]:
     """Get all unique base images from the dataset."""
-    dataset = get_dataset(dataset_name, split)
+    local_path = download_and_concat_dataset(dataset_name, LANGUAGE)  # split now unused
     base_images = set()
 
-    for _, row in dataset.iterrows():
-        image = get_official_docker_image(row.to_dict())
-        base_images.add(image)
+    with open(local_path, "r", encoding="utf-8") as f:  # one JSON object per line
+        for line in f:
+            if not line.strip():  # ignore blank lines
+                continue
+            try:
+                instance = json.loads(line)
+            except json.JSONDecodeError as e:
+                logger.warning(f"Skipping malformed JSON line: {e}")
+                continue
+            image = get_official_docker_image(instance)
+            base_images.add(image)  # the set de-duplicates repeated images
 
     return list(base_images)
 
@@ -107,6 +116,7 @@ def main():
         build_dir=Path(
             args.output_dir or default_build_output_dir(args.dataset, args.split)
         ),
+        base_image_to_custom_tag_fn=extract_custom_tag,
         max_workers=args.num_workers,
         dry_run=False,
     )
diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py
index 43fc61bc5..0578f80e0 100644
--- a/benchmarks/multiswebench/eval_infer.py
+++ b/benchmarks/multiswebench/eval_infer.py
@@ -95,11 +95,13 @@ def run_multi_swebench_evaluation(
             error_msg = f"Evaluation failed with return code {result.returncode}"
             print(f"ERROR: {error_msg}")
             logger.error(error_msg)
+            raise subprocess.CalledProcessError(result.returncode, cmd)
 
     except Exception as e:
         error_msg = f"Error running evaluation: {e}"
         print(f"ERROR: {error_msg}")
         logger.error(error_msg)
+        raise
 
 
 def main():
@@ -139,7 +141,7 @@ def main():
         logger.info(f"Results saved to {results_file}")
 
         # Move the report file to the output location
-        output_report_path = args.input_file.with_suffix(".report.json")
+        output_report_path = Path(args.input_file).with_suffix(".report.json")
         shutil.move(str(results_file), str(output_report_path))
         logger.info(f"Report moved to {output_report_path}")
 
diff --git a/benchmarks/openagentsafety/build_images.py b/benchmarks/openagentsafety/build_images.py
index acb183844..29a39e478 100644
--- a/benchmarks/openagentsafety/build_images.py
+++ b/benchmarks/openagentsafety/build_images.py
@@ -1,6 +1,7 @@
 """Build OpenAgentSafety Docker image from vendor/software-agent-sdk"""
 
 import logging
+import os
 import subprocess
 from pathlib import Path
 
@@ -31,6 +32,16 @@ def get_vendor_sdk_commit() -> str:
     return result.stdout.strip()
 
 
+def get_image_name() -> str:
+    """Return the ``name:tag`` reference of the OpenAgentSafety agent-server image."""
+    # `or` instead of a getenv default: an empty variable also selects the fallback.
+    image_name = os.getenv("EVAL_AGENT_SERVER_IMAGE") or "openagentsafety-agent-server"
+    tag_prefix = os.getenv("IMAGE_TAG_PREFIX")
+    # A non-empty prefix fixes the tag; otherwise get_vendor_sdk_commit() supplies it.
+    tag = f"{tag_prefix}-openagentsafety" if tag_prefix else get_vendor_sdk_commit()
+    return f"{image_name}:{tag}"
+
+
 def check_image_exists(image_name: str) -> bool:
     """Check if a Docker image exists locally."""
     result = subprocess.run(
@@ -48,13 +59,14 @@ def build_workspace_image(force_rebuild: bool = False, no_cache: bool = False) -
         force_rebuild: if True, ignore existing images and rebuild.
         no_cache: if True, pass --no-cache to docker build to avoid layer cache.
     """
-    sdk_commit = get_vendor_sdk_commit()
-    image_name = f"openagentsafety-agent-server:{sdk_commit}"
+    image_name = get_image_name()
 
     if not force_rebuild and check_image_exists(image_name):
         logger.info(f"#### Using existing image: {image_name}")
         return image_name
 
+    sdk_commit = get_vendor_sdk_commit()
+
     logger.info(f"#### Building Docker image: {image_name}")
     logger.info(f"#### SDK version: {sdk_commit}")
     logger.info("#### This will take approximately 3-5 minutes...")
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 3691e6a96..3c55b5310 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -4,7 +4,6 @@
 import json
 import os
 import subprocess
-import tempfile
 import time
 from typing import Any, List
 
@@ -13,7 +12,11 @@
 import requests
 from jinja2 import Environment, FileSystemLoader
 
-from benchmarks.openagentsafety.build_images import build_workspace_image
+from benchmarks.openagentsafety.build_images import (
+    build_workspace_image,
+    check_image_exists,
+    get_image_name,
+)
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -42,12 +45,13 @@ def convert_numpy_types(obj: Any) -> Any:
         return float(obj)
     elif isinstance(obj, np.ndarray):
         return obj.tolist()
-    elif pd.isna(obj):
-        return None
     elif isinstance(obj, dict):
         return {k: convert_numpy_types(v) for k, v in obj.items()}
     elif isinstance(obj, list):
         return [convert_numpy_types(item) for item in obj]
+    # pd.isna(list) returns an array (ambiguous in `elif`), so containers come first
+    elif pd.isna(obj):
+        return None
     return obj
 
 
@@ -61,6 +65,10 @@ def default(self, o):
             return float(o)
         elif isinstance(o, np.ndarray):
             return o.tolist()
+        elif hasattr(o, "model_dump"):
+            return o.model_dump()
+        # JSONEncoder.default() is only called for objects json cannot serialize
+        # natively, so lists (for which pd.isna returns an array) never reach here.
         elif pd.isna(o):
             return None
         return super().default(o)
@@ -187,7 +195,7 @@ def cleanup_docker_containers():
                 "-a",
                 "-q",
                 "--filter",
-                "ancestor=openagentsafety-agent-server:local",
+                f"ancestor={get_image_name()}",
             ],
             capture_output=True,
             text=True,
@@ -235,39 +243,25 @@ def write_npc_config(
     }
 
     config_json = json.dumps(config, indent=2, cls=NumpyEncoder)
+    # NOTE: The heredoc approach is simpler than the previous tempfile+upload but
+    # embeds config content in the bash command string, which could appear in
+    # container logs or process listings. This is acceptable here because the
+    # config contains NPC scenario data (not secrets) — API keys are resolved
+    # separately via environment variables and never written to this file.
+    bash_command = f"""
+mkdir -p /npc
+cat > /npc/.npc_config.json << 'EOFNPC'
+{config_json}
+EOFNPC
+chmod 600 /npc/.npc_config.json
+"""
 
-    # Create /npc directory in container (doesn't leak sensitive info)
-    try:
-        workspace.execute_command("mkdir -p /npc", timeout=30)
-    except Exception as e:
-        logger.error(f"Failed to create /npc directory: {e}")
-        raise
-
-    # Write config to temporary file on host
-    temp_fd, temp_path = tempfile.mkstemp(suffix=".json", text=True)
     try:
-        with os.fdopen(temp_fd, "w") as f:
-            f.write(config_json)
-
-        # Upload file to container using file_upload (avoids bash command leak)
-        result = workspace.file_upload(
-            source_path=temp_path, destination_path="/npc/.npc_config.json"
-        )
-
-        if not result.success:
-            raise RuntimeError(f"File upload failed: {result}")
-
-        # Set restrictive permissions
-        workspace.execute_command("chmod 600 /npc/.npc_config.json", timeout=30)
-
-        logger.info("Wrote NPC config to /npc/.npc_config.json (via file_upload)")
+        workspace.execute_command(bash_command, timeout=60)
+        logger.info("Wrote NPC config to /npc/.npc_config.json")
     except Exception as e:
         logger.error(f"Failed to write NPC config: {e}")
         raise
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_path):
-            os.unlink(temp_path)
 
 
 def generate_instruction(instance_data: dict, template_path: str | None = None) -> str:
@@ -401,7 +395,18 @@ def prepare_workspace(
             resource_factor: Resource factor for runtime allocation (default: 1).
             forward_env: Environment variables to forward into the workspace.
         """
-        server_image = build_workspace_image()
+        # Try to build image on-the-fly, fall back to pre-built if build fails
+        try:
+            server_image = build_workspace_image()
+        except (subprocess.CalledProcessError, RuntimeError) as e:
+            logger.warning(f"On-the-fly build failed: {e}")
+            server_image = get_image_name()
+
+            if not check_image_exists(server_image):
+                raise RuntimeError(
+                    f"On-the-fly build failed and pre-built image {server_image} does not exist"
+                )
+            logger.info(f"Using pre-built image {server_image}")
 
         workspace = DockerWorkspace(
             server_image=server_image,
@@ -562,6 +567,77 @@ def event_callback(event) -> None:
         )
 
 
+def generate_report(output_jsonl: str, report_path: str, model_name: str) -> None:
+    """Write a ``.report.json`` for ``output_jsonl`` in the other benchmarks' format.
+
+    As in eval_infer.py, an instance is "resolved" only when ``final_score.result``
+    is > 0 and equals ``final_score.total``; a truthy ``error`` (top level or in
+    ``test_result``) makes it an error instance. JSON ``null`` counts as missing.
+
+    Args:
+        output_jsonl: JSONL results file; if absent, warn and write nothing.
+        report_path: Destination path of the JSON report.
+        model_name: Stored under ``model_name_or_path``.
+    """
+    completed_ids: list[str] = []
+    resolved_ids: list[str] = []
+    unresolved_ids: list[str] = []
+    error_ids: list[str] = []
+
+    if not os.path.exists(output_jsonl):
+        logger.warning("No output.jsonl found at %s, skipping report", output_jsonl)
+        return
+
+    with open(output_jsonl, "r", encoding="utf-8") as f:  # locale-independent
+        for line in f:
+            try:
+                data = json.loads(line.strip())
+            except json.JSONDecodeError:
+                continue  # blank or malformed line: skip silently
+
+            instance_id = data.get("instance_id", "")
+            test_result = data.get("test_result") or {}  # null -> {}
+            if data.get("error") or test_result.get("error"):
+                error_ids.append(instance_id)
+            else:
+                completed_ids.append(instance_id)
+                final_score = test_result.get("final_score") or {}
+                result = final_score.get("result") or 0  # null -> 0
+                total = final_score.get("total") or 0
+                if result > 0 and result == total:
+                    resolved_ids.append(instance_id)
+                else:
+                    unresolved_ids.append(instance_id)
+
+    submitted_ids = completed_ids + error_ids
+    report = {
+        "model_name_or_path": model_name,
+        "total_instances": len(submitted_ids),
+        "submitted_instances": len(submitted_ids),
+        "completed_instances": len(completed_ids),
+        "incomplete_instances": 0,
+        "resolved_instances": len(resolved_ids),
+        "unresolved_instances": len(unresolved_ids),
+        "empty_patch_instances": 0,
+        "error_instances": len(error_ids),
+        "submitted_ids": submitted_ids,
+        "completed_ids": completed_ids,
+        "incomplete_ids": [],
+        "resolved_ids": resolved_ids,
+        "unresolved_ids": unresolved_ids,
+    }
+
+    with open(report_path, "w", encoding="utf-8") as f:
+        json.dump(report, f, indent=4)
+
+    logger.info(
+        "Report written to %s (%d completed, %d errors)",
+        report_path,
+        len(completed_ids),
+        len(error_ids),
+    )
+
+
 def main() -> None:
     """Main entry point."""
     parser = get_parser(add_llm_config=True)
@@ -573,6 +649,7 @@ def main() -> None:
     if args.max_attempts < 1:
         raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
 
+    # Load LLM config
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
 
@@ -600,13 +677,14 @@ def main() -> None:
         max_iterations=args.max_iterations,
         eval_output_dir=structured_output_dir,
         details={
-            "server_image": "openagentsafety-agent-server:local",
+            "server_image": get_image_name(),
             "platform": "linux/amd64",
         },
         eval_limit=args.n_limit,
         max_attempts=args.max_attempts,
         critic=critic,
         selected_instances_file=args.select,
+        max_retries=args.max_retries,
         enable_delegation=args.enable_delegation,
     )
 
@@ -663,6 +741,10 @@ def _cb(instance: EvalInstance, out: EvalOutput) -> None:
     # Run evaluation
     evaluator.run(on_result=_default_on_result_writer(metadata.eval_output_dir))
 
+    # Generate .report.json for nemo_evaluator compatibility
+    report_path = os.path.join(metadata.eval_output_dir, "output.report.json")
+    generate_report(evaluator.output_path, report_path, llm.model)
+
     # Final cleanup
     cleanup_docker_containers()
 
diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index ab52b26ec..379d718da 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -117,11 +117,11 @@ def convert_to_swebench_format(input_file: str, output_file: str) -> None:
 def run_swebench_evaluation(
     predictions_file: str,
     run_id: str,
-    dataset: str,
-    workers: int,
-    split: str,
-    modal: bool,
-    timeout: int,
+    dataset: str = EVAL_DEFAULTS["dataset"],
+    workers: int = EVAL_DEFAULTS["workers"],
+    split: str = EVAL_DEFAULTS["split"],
+    modal: bool = EVAL_DEFAULTS["modal"],
+    timeout: int = EVAL_DEFAULTS["timeout"],
 ) -> None:
     """
     Run SWE-Bench evaluation on the predictions file.
diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py
index b3adec48b..737213680 100644
--- a/benchmarks/swebenchmultimodal/eval_infer.py
+++ b/benchmarks/swebenchmultimodal/eval_infer.py
@@ -257,6 +257,7 @@ def run_swebench_multimodal_evaluation(
         split: Dataset split to use (default: dev)
         workers: Number of workers to use for evaluation
         run_id: Optional run ID for the evaluation
+        modal: Whether to use Modal for evaluation (default: True)
 
     Returns:
         Path to the generated report.json file, or None if not found
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index 57ee83506..bdfb7b13e 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -25,7 +25,6 @@
     EvalOutput,
 )
 from benchmarks.utils.version import IMAGE_TAG_PREFIX
-from openhands.agent_server.docker.build import _base_slug
 from openhands.sdk import Agent, Conversation, Tool, __version__, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
@@ -55,6 +54,10 @@ def get_agent_server_docker_image(
     target: str = "source-minimal",
 ) -> str:
     """Get the agent server Docker image for an instance."""
+    # Importing here because openhands.agent_server.docker.build runs git checks
+    # which fails when installed as a package outside the git repo
+    from openhands.agent_server.docker.build import _base_slug
+
     official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
     return (
         "ghcr.io/all-hands-ai/agent-server"
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index f61110729..b4e56b9ad 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -20,6 +20,7 @@
 from pydantic import BaseModel, Field
 from tqdm.auto import tqdm
 
+from benchmarks.swebench.constants import TargetType
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.buildx_utils import (
     buildkit_disk_usage,
@@ -28,7 +29,6 @@
 )
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.image_utils import local_image_exists, remote_image_exists
-from openhands.agent_server.docker.build import BuildOptions, TargetType, build
 from openhands.sdk import get_logger
 
 
@@ -282,6 +282,10 @@ def build_image(
     target: TargetType = "source-minimal",
     push: bool = False,
 ) -> BuildOutput:
+    # Importing here because openhands.agent_server.docker.build runs git checks
+    # which fails when installed as a package outside the git repo
+    from openhands.agent_server.docker.build import BuildOptions, build
+
     # Get SDK info from submodule to ensure tags use the correct SDK SHA
     git_ref, git_sha, sdk_version = _get_sdk_submodule_info()
 
diff --git a/tests/test_openagentsafety_report.py b/tests/test_openagentsafety_report.py
new file mode 100644
index 000000000..f21741548
--- /dev/null
+++ b/tests/test_openagentsafety_report.py
@@ -0,0 +1,127 @@
+"""Tests for openagentsafety generate_report functionality."""
+
+import json
+import tempfile
+from pathlib import Path
+
+from benchmarks.openagentsafety.run_infer import generate_report
+
+
+def test_mixed_results():
+    """Resolved, unresolved and errored records each land in the right bucket."""
+    records = [
+        # resolved: result == total > 0
+        {
+            "instance_id": "resolved-1",
+            "test_result": {"final_score": {"result": 3, "total": 3}},
+        },
+        # unresolved: result < total
+        {
+            "instance_id": "unresolved-1",
+            "test_result": {"final_score": {"result": 1, "total": 3}},
+        },
+        # unresolved: result == 0
+        {
+            "instance_id": "unresolved-2",
+            "test_result": {"final_score": {"result": 0, "total": 2}},
+        },
+        # error at top level
+        {
+            "instance_id": "error-1",
+            "error": "timeout",
+            "test_result": {},
+        },
+        # error inside test_result
+        {
+            "instance_id": "error-2",
+            "test_result": {"error": "evaluation crashed"},
+        },
+    ]
+    payload = "".join(json.dumps(record) + "\n" for record in records)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+        results_file = workdir / "output.jsonl"
+        summary_file = workdir / "output.report.json"
+        results_file.write_text(payload)
+        generate_report(str(results_file), str(summary_file), "test-model")
+        report = json.loads(summary_file.read_text())
+
+    assert report["model_name_or_path"] == "test-model"
+    assert report["resolved_ids"] == ["resolved-1"]
+    assert sorted(report["unresolved_ids"]) == ["unresolved-1", "unresolved-2"]
+    assert report["resolved_instances"] == 1
+    assert report["unresolved_instances"] == 2
+    assert report["error_instances"] == 2
+    assert report["completed_instances"] == 3
+    assert report["submitted_instances"] == 5
+    assert report["total_instances"] == 5
+
+
+def test_empty_file():
+    """An empty output.jsonl yields a report whose checked counters are all zero."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+        results_file = workdir / "output.jsonl"
+        summary_file = workdir / "output.report.json"
+        results_file.write_text("")
+        generate_report(str(results_file), str(summary_file), "test-model")
+        report = json.loads(summary_file.read_text())
+
+    assert report["total_instances"] == 0
+    assert report["resolved_instances"] == 0
+    assert report["unresolved_instances"] == 0
+    assert report["error_instances"] == 0
+
+
+def test_missing_file():
+    """No report file is created when output.jsonl does not exist."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        summary_file = Path(tmpdir, "output.report.json")
+        generate_report("/nonexistent/output.jsonl", str(summary_file), "m")
+        assert summary_file.exists() is False
+
+
+def test_malformed_json_lines_skipped():
+    """Malformed JSON lines are skipped without failing the report.
+
+    Only the well-formed record is expected to be counted.
+    """
+    valid_record = {
+        "instance_id": "good-1",
+        "test_result": {"final_score": {"result": 1, "total": 1}},
+    }
+    # First line is not JSON; the second is a fully passing instance.
+    payload = f"not valid json\n{json.dumps(valid_record)}\n"
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+        results_file = workdir / "output.jsonl"
+        summary_file = workdir / "output.report.json"
+        results_file.write_text(payload)
+
+        generate_report(str(results_file), str(summary_file), "test-model")
+        report = json.loads(summary_file.read_text())
+
+    assert report["total_instances"] == 1
+    assert report["resolved_instances"] == 1
+    assert report["resolved_ids"] == ["good-1"]
+
+
+def test_missing_final_score_is_unresolved():
+    """A record without final_score counts as completed but not resolved."""
+    # "test_result" is present but carries no "final_score" entry.
+    record = {"instance_id": "no-score", "test_result": {}}
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        workdir = Path(tmpdir)
+        results_file = workdir / "output.jsonl"
+        summary_file = workdir / "output.report.json"
+        results_file.write_text(json.dumps(record) + "\n")
+        generate_report(str(results_file), str(summary_file), "test-model")
+        report = json.loads(summary_file.read_text())
+
+    assert report["completed_instances"] == 1
+    assert report["resolved_instances"] == 0
+    assert report["unresolved_instances"] == 1
+    assert report["unresolved_ids"] == ["no-score"]