From 0f43fb85330c58ca5ea1fda594b3b8db8bc4df71 Mon Sep 17 00:00:00 2001 From: "Xingdi (Eric) Yuan" Date: Tue, 25 Nov 2025 14:00:45 -0500 Subject: [PATCH 01/12] free env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update free_env.py remove default instructions submit tool Update free_env.py override sys prompt refactor free_env Update config_free_env.yaml add in script the apply_eval: false add back tree Update test_free_env.py Update run_free_env.py being able to reset Fix Kube double reservation (#285) * fix error double reservation * black --------- Co-authored-by: Alessandro Sordoni add terminal type in yaml Use None is info doesn't exists in base agent (#290) Update run_free_env.py Update config_free_env.yaml Rename apply_eval to eval_on_submit Agent args dataclass (#287) * fix error double reservation * black * agent args * Clean up env_kwargs * Fixing tests * Remove env from agent init * More changes to agents * Fix task name * Fix tests --------- Co-authored-by: Alessandro Sordoni Co-authored-by: Marc-Alexandre Côté Revert "Merge branch 'main' into free_env" This reverts commit 30108f3db9356227a3a5cca6a10c00e12c487b70, reversing changes made to 391e9d3f6a02fdb0eb045d33dab858a409de230e. Update kubernetes.py Fix logger when running in single-thread Adding a bunch of logging + avoid recreating the pod if it stopped running. 
20 seconds -> 20 mins refresh token renew AOAI client every hour to avoid ClientAuthenticationError --- debug_gym/agents/__init__.py | 2 + debug_gym/agents/base_agent.py | 12 +- debug_gym/agents/free_agent.py | 44 ++ debug_gym/gym/envs/__init__.py | 3 + debug_gym/gym/envs/env.py | 9 +- debug_gym/gym/envs/free_env.py | 196 +++++++++ debug_gym/gym/envs/mini_nightmare.py | 4 +- debug_gym/gym/terminals/__init__.py | 27 +- debug_gym/gym/terminals/kubernetes.py | 239 ++++++++--- debug_gym/gym/tools/pdb.py | 3 +- debug_gym/gym/tools/submit.py | 11 +- debug_gym/llms/azure_openai.py | 40 +- debug_gym/llms/copilot.py | 2 +- debug_gym/llms/openai.py | 22 +- debug_gym/logger.py | 5 + notebooks/tutorial.ipynb | 586 ++++++++++++++++++++------ requirements.txt | 2 +- scripts/config_free_env.yaml | 42 ++ scripts/free_env_human.py | 200 +++++++++ scripts/run_free_env.py | 163 +++++++ tests/agents/conftest.py | 4 +- tests/agents/test_free_agent.py | 62 +++ tests/gym/envs/test_free_env.py | 133 ++++++ tests/gym/terminals/test_terminal.py | 34 ++ 24 files changed, 1636 insertions(+), 209 deletions(-) create mode 100644 debug_gym/agents/free_agent.py create mode 100644 debug_gym/gym/envs/free_env.py create mode 100644 scripts/config_free_env.yaml create mode 100644 scripts/free_env_human.py create mode 100644 scripts/run_free_env.py create mode 100644 tests/agents/test_free_agent.py create mode 100644 tests/gym/envs/test_free_env.py diff --git a/debug_gym/agents/__init__.py b/debug_gym/agents/__init__.py index 424ccc9c..fbfbae31 100644 --- a/debug_gym/agents/__init__.py +++ b/debug_gym/agents/__init__.py @@ -1,4 +1,6 @@ +from debug_gym.agents.base_agent import BaseAgent, register_agent from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent +from debug_gym.agents.free_agent import FreeAgent from debug_gym.agents.rewrite_agent import RewriteAgent from debug_gym.agents.solution_agent import AgentSolution from debug_gym.agents.swe_agent import SWEAgent diff --git 
a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py index 458986a8..67f95398 100644 --- a/debug_gym/agents/base_agent.py +++ b/debug_gym/agents/base_agent.py @@ -45,10 +45,8 @@ def __init__( self._uuid = self.config.get("uuid", str(uuid.uuid4())) self._output_path = pjoin(self.config["output_path"], self._uuid) - os.makedirs(self._output_path, exist_ok=True) - - self.set_seed(self.config["random_seed"]) - self.history = HistoryTracker(self.config["memory_size"]) + self.set_seed(self.args.random_seed) + self.history = HistoryTracker(self.args.memory_size) def set_seed(self, seed): np.random.seed(seed) @@ -223,7 +221,7 @@ def build_prompt(self, info): def run(self, task_name=None, debug=False): step = 0 info = None - max_steps = self.config["max_steps"] + max_steps = self.args.max_steps try: self.history.reset() info = self.env.reset(options={"task_name": task_name}) @@ -268,7 +266,7 @@ def run(self, task_name=None, debug=False): if ( info.terminated - or info.rewrite_counter >= self.config["max_rewrite_steps"] + or info.rewrite_counter >= self.args.max_rewrite_steps ): reason = ( "terminated" if info.resolved else "max_rewrite_steps reached" @@ -312,7 +310,7 @@ def run(self, task_name=None, debug=False): step=step + 1, total_steps=step + 1, score=info.score if info else 0, - max_score=info.max_score, + max_score=info.max_score if info else None, status="error", ) raise diff --git a/debug_gym/agents/free_agent.py b/debug_gym/agents/free_agent.py new file mode 100644 index 00000000..951bde3b --- /dev/null +++ b/debug_gym/agents/free_agent.py @@ -0,0 +1,44 @@ +"""Simple agent example for interacting with FreeEnv.""" + +from debug_gym.agents.base_agent import BaseAgent, register_agent + + +@register_agent +class FreeAgent(BaseAgent): + """Minimal reasoning agent tailored for FreeEnv sessions.""" + + name = "free_agent" + # Customized system instructions keep FreeEnv light-weight while still + # providing the model with a structured exploration checklist. 
+ system_prompt = ( + "You are assisting in an exploratory codebase understanding session inside an open-ended container.\n" + "You have access to a set of tools to inspect and modify the codebase.\n" + "Your goal is to use the tools to gather as much information about the codebase as possible.\n" + "Output both your thinking process (if any) and the tool call (must) in the response.\n" + "When you are done exploring, use the submit tool as the final action to end the session." + ) + + def __init__(self, config, env, llm=None, logger=None): + super().__init__(config=config, env=env, llm=llm, logger=logger) + + override_prompt = config.get("system_prompt") + if override_prompt is not None: + self.system_prompt = str(override_prompt) + + def run(self, task_name=None, debug=False): + """Wrap BaseAgent.run to surface clearer errors when startup fails.""" + try: + return super().run(task_name=task_name, debug=debug) + except AttributeError as exc: + error_msg = str(exc) + sentinel = "'NoneType' object has no attribute 'max_score'" + if sentinel not in error_msg: + raise + + root_cause = exc.__context__ or exc.__cause__ or exc + self.logger.error( + "FreeAgent failed to reset the environment before receiving initial observations. " + "Check that the configured container image exists and is accessible." 
+ ) + + raise root_cause diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py index 86ef4cab..89dfd501 100644 --- a/debug_gym/gym/envs/__init__.py +++ b/debug_gym/gym/envs/__init__.py @@ -1,5 +1,6 @@ from debug_gym.gym.envs.aider import AiderBenchmarkEnv from debug_gym.gym.envs.env import RepoEnv, TooledEnv +from debug_gym.gym.envs.free_env import FreeEnv from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv from debug_gym.gym.envs.r2egym import R2EGymEnv from debug_gym.gym.envs.swe_bench import SWEBenchEnv @@ -23,5 +24,7 @@ def select_env(env_type: str = None) -> type[RepoEnv]: return MiniNightmareEnv case "r2egym": return R2EGymEnv + case "free": + return FreeEnv case _: raise ValueError(f"Unknown benchmark {env_type}") diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py index e691a31b..210c92f6 100644 --- a/debug_gym/gym/envs/env.py +++ b/debug_gym/gym/envs/env.py @@ -40,14 +40,16 @@ def __str__(self) -> str: # Status section lines.append( f"📊 Status: {('✅' if self.resolved else '❌') + ' (TERMINATED)' if self.terminated else '🔄 (IN PROGRESS)'}\t" - f"🎯 Score: {self.score}/{self.max_score}\t" + f"🎯 Score: {self.score}/{self.max_score or '?'}\t" f"✏️ Rewrites: {self.rewrite_counter}" ) # Action section - if self.action: + if self.action_tool_call: lines.append("🔧 Last Action:") - lines.append(f" Tool: {self.action.name}") + lines.append(f" Tool: {self.action_tool_call.name}") + if self.action_content: + lines.append(f" Explanation: {self.action_content}") if self.action_reasoning: lines.append(f" Reasoning: {self.action_reasoning}") lines.append("") @@ -336,6 +338,7 @@ def reset(self, *, options: dict = None): """Resets the environment and returns eval as the initial observation.""" options = options or {} self.logger.debug("Resetting environment") + self.close() # Clean up previous workspace and terminal. 
self.setup_task(task_name=options.get("task_name"), options=options) self.setup_workspace() self.setup_terminal() diff --git a/debug_gym/gym/envs/free_env.py b/debug_gym/gym/envs/free_env.py new file mode 100644 index 00000000..1942f249 --- /dev/null +++ b/debug_gym/gym/envs/free_env.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import shlex +from pathlib import Path +from typing import Any + +from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.logger import DebugGymLogger + + +class FreeEnv(RepoEnv): + """Lightweight RepoEnv wrapper for running arbitrary container images.""" + + DEFAULT_TASK_NAME = "free-session" + + def __init__( + self, + image: str, + *, + terminal: Terminal | None = None, + mount_path: str | Path | None = None, + setup_commands: list[str] | None = None, + instructions: str | None = None, + init_git: bool = True, + workspace_dir: str | Path = "/testbed", + logger: DebugGymLogger | None = None, + **env_kwargs: Any, + ) -> None: + """Create a free-form environment backed by an existing repository terminal.""" + self.container_image = image + self._custom_instructions = (instructions or "").strip() + self.init_git = init_git + self._setup_commands = list(setup_commands or []) + self._workspace_dir = str(workspace_dir) + + shared_logger = logger or DebugGymLogger("debug-gym") + + super().__init__( + path=str(mount_path) if mount_path is not None else None, + entrypoint="true", + debug_entrypoint="true", + max_score=0, + terminal=terminal, + logger=shared_logger, + **env_kwargs, + ) + + if self.terminal is not None: + self._apply_terminal_settings() + + def _apply_terminal_settings(self) -> None: + """Keep terminal metadata (image/setup commands) in sync with env state.""" + terminal = self.terminal + if terminal is None: + return + if hasattr(terminal, "base_image"): + setattr(terminal, "base_image", 
self.container_image) + + if hasattr(terminal, "setup_commands"): + terminal.setup_commands = list(self._setup_commands) + + if hasattr(terminal, "working_dir") and not isinstance(terminal, LocalTerminal): + try: + terminal.working_dir = self._workspace_dir + except ValueError: + self.logger.debug( + "Terminal already active; keeping working_dir=%s", + getattr(terminal, "working_dir", self._workspace_dir), + ) + + if hasattr(terminal, "task_name"): + try: + terminal.task_name = self.DEFAULT_TASK_NAME + except ValueError: + self.logger.debug( + "Terminal already active; keeping existing task name." + ) + + terminal.logger = self.logger + + def load_dataset(self, problems: str | list[str] | None = None): + """Expose a single synthetic task keyed by DEFAULT_TASK_NAME.""" + return {self.DEFAULT_TASK_NAME: {"image": self.container_image}} + + def setup_task(self, task_name: str | None, options: dict | None = None) -> None: + """Record base image metadata for consistency with RepoEnv expectations.""" + self.task_name = task_name or self.DEFAULT_TASK_NAME + self.base_image = self.container_image + if hasattr(self.terminal, "base_image"): + setattr(self.terminal, "base_image", self.base_image) + + def setup_workspace(self) -> None: + """Ensure the remote workspace matches the configured working directory.""" + if isinstance(self.terminal, LocalTerminal): + super().setup_workspace() + return + + self.workspace.reset() + self.workspace.working_dir = Path(self._workspace_dir) + if self.terminal is not None: + current_dir = getattr(self.terminal, "working_dir", None) + if current_dir != self._workspace_dir: + try: + self.terminal.working_dir = self._workspace_dir + except ValueError: + self.logger.debug( + "Terminal already active; keeping working_dir=%s", current_dir + ) + # Ensure core utilities exist before RepoEnv renders directory listings. 
+ self.terminal.run( + "apt-get update -y && apt-get install -y tree", raises=True + ) + self.terminal.run( + f"mkdir -p {shlex.quote(self._workspace_dir)}", + raises=True, + ) + + if self.path: + self.workspace.copy_content(self.path) + + self.workspace.setup_file_filters() + + def setup_terminal(self) -> None: + """Apply FreeEnv tweaks and reuse RepoEnv git bootstrapping when enabled.""" + self._apply_terminal_settings() + + if self.terminal is not None: + self.terminal.run("touch .debugignore .debugreadonly") + + if not self.init_git: + return + if not self._git_available(): + self.logger.debug( + "Git is not available in the container; skipping repository setup.", + ) + return + super().setup_terminal() + + def _git_available(self) -> bool: + """Check for git presence before attempting repository initialization.""" + if self.terminal is None: + return False + success, _ = self.terminal.run("command -v git") + return success + + @property + def instructions(self) -> str: + """Provide user-facing guidance, falling back to a generic sandbox blurb.""" + return ( + self._custom_instructions + or "You are placed in an isolated Linux environment, use the available tools to interact with the environment effectively." 
+ ) + + def reset(self, *, options: dict | None = None): + """Allow callers to mutate container settings before delegating to RepoEnv.""" + options = options or {} + + image = options.get("image") + workspace_dir = options.get("workspace_dir") + setup_commands = options.get("setup_commands") + instructions = options.get("instructions") + init_git = options.get("init_git") + + restart_terminal = False + + if image and image != self.container_image: + self.container_image = image + restart_terminal = True + + if workspace_dir and str(workspace_dir) != self._workspace_dir: + self._workspace_dir = str(workspace_dir) + restart_terminal = True + + if setup_commands is not None: + new_commands = list(setup_commands) + if new_commands != self._setup_commands: + self._setup_commands = new_commands + restart_terminal = True + + if instructions is not None: + self._custom_instructions = instructions + + if init_git is not None: + self.init_git = bool(init_git) + + if restart_terminal and self.terminal is not None: + try: + self.terminal.close() + except Exception as exc: # noqa: BLE001 - diagnostics only + self.logger.debug("Failed to close terminal cleanly: %s", exc) + + self._apply_terminal_settings() + + return super().reset(options=options) diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py index b850d087..b5cee0a8 100644 --- a/debug_gym/gym/envs/mini_nightmare.py +++ b/debug_gym/gym/envs/mini_nightmare.py @@ -115,14 +115,14 @@ def setup_task(self, task_name: str, options: dict = None): def setup_workspace(self): self.workspace.reset() - self.logger.info("Copying files..") + self.logger.debug("Copying files..") self.workspace.copy_content( src=self.current_task["codebase"], target=self.workspace.working_dir ) self.workspace.setup_file_filters() # Use codebase's .debugignore and .debugreadonly. 
def setup_terminal(self): - self.logger.info(f"Configuring {self.terminal}...") + self.logger.debug(f"Configuring {self.terminal}...") self.terminal.run("git init") self.terminal.run("git config user.name 'debug-gym'") diff --git a/debug_gym/gym/terminals/__init__.py b/debug_gym/gym/terminals/__init__.py index 068a8b6a..1f34e1d8 100644 --- a/debug_gym/gym/terminals/__init__.py +++ b/debug_gym/gym/terminals/__init__.py @@ -13,8 +13,20 @@ def select_terminal( if terminal_config is None: return None + if isinstance(terminal_config, Terminal): + return terminal_config + + if not isinstance(terminal_config, dict): + raise TypeError( + "terminal configuration must be a dict, Terminal instance, or None", + ) + + config = dict(terminal_config) + terminal_type = str(config.pop("type", "")).lower() + if not terminal_type: + raise ValueError("Terminal configuration must include a 'type' key") + logger = logger or DebugGymLogger("debug-gym") - terminal_type = terminal_config["type"] match terminal_type: case "docker": terminal_class = DockerTerminal @@ -25,8 +37,17 @@ def select_terminal( case _: raise ValueError(f"Unknown terminal {terminal_type}") + extra_labels = config.pop("extra_labels", {}) or {} + if uuid is not None: + extra_labels = {**extra_labels, "uuid": uuid} + + if terminal_class is KubernetesTerminal and extra_labels: + config["extra_labels"] = extra_labels + + if terminal_class is not KubernetesTerminal: + config.pop("extra_labels", None) + return terminal_class( - **terminal_config, logger=logger, - extra_labels={"uuid": uuid}, + **config, ) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 33f36d0b..37b7a9b5 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -26,6 +26,12 @@ NB_RETRIES_RUN = 50 # Number of retries for running a command +class SandboxReservationError(Exception): + """Raised when a pod cannot be created due to sandbox name reservation conflict.""" + + 
pass + + def _clean_for_kubernetes(name: str) -> str: """Clean pod name to conform to Kubernetes naming conventions.""" # replace any characters not in the regex with hyphens @@ -94,6 +100,10 @@ def wait_for_pod_ready(self, timeout: int = 3600 * 2): self.logger.debug(f"{self} Waiting to be ready...") w = watch.Watch() + start_time = time.time() + sandbox_check_interval = 30 # Check for sandbox errors every 30 seconds + last_sandbox_check = 0 + try: for event in w.stream( self.k8s_client.list_namespaced_pod, @@ -114,10 +124,22 @@ def wait_for_pod_ready(self, timeout: int = 3600 * 2): return elif phase in ["Failed", "Unknown", "Succeeded"]: raise ValueError(f"{self} is in {phase} state instead of running.") - elif phase == "Pending" and event_type == "ADDED": - # Only log pending status on initial ADDED event or when reason changes - self._log_pending_status(pod) - + elif phase == "Pending": + if event_type == "ADDED": + # Only log pending status on initial ADDED event or when reason changes + self._log_pending_status(pod) + + # Periodically check for sandbox reservation errors while pending + elapsed = time.time() - start_time + if elapsed - last_sandbox_check >= sandbox_check_interval: + last_sandbox_check = elapsed + if self._has_sandbox_reservation_error(): + raise SandboxReservationError( + f"{self} has sandbox reservation conflict" + ) + + except SandboxReservationError: + raise # Re-raise sandbox errors without wrapping except Exception as e: self.logger.debug(f"{self} Error during pod watch: {e}") raise ValueError(f"Error watching pod {self.name}: {e}") @@ -129,6 +151,24 @@ def wait_for_pod_ready(self, timeout: int = 3600 * 2): f"Pod {self.name} did not become ready within {timeout} seconds" ) + def _has_sandbox_reservation_error(self) -> bool: + """Check pod events for sandbox reservation errors.""" + try: + events = self.k8s_client.list_namespaced_event( + namespace=self.namespace, + field_selector=f"involvedObject.name={self.name}", + ) + for event in 
events.items: + if event.reason == "FailedCreatePodSandBox" and event.message: + if "is reserved for" in event.message: + self.logger.warning( + f"{self} Sandbox reservation conflict detected: {event.message}" + ) + return True + except ApiException as e: + self.logger.debug(f"{self} Error checking pod events: {e}") + return False + def _log_pending_status(self, pod): """Log pending status only if it's different from the last one.""" if pod.status.conditions: @@ -330,9 +370,47 @@ def default_shell_command(self) -> list[str]: bash_cmd = "/bin/bash --noprofile --norc --noediting" return f"kubectl {kubeconfig}exec -it {self.pod.name} -c main -n {self.pod.namespace} -- {bash_cmd}" + def _ensure_pod_running(self) -> None: + """Ensure the backing pod exists and is in Running phase.""" + if self._pod is None: + self.setup_pod() + return + + try: + if self._pod.is_running(): + return + except Exception as exc: # noqa: BLE001 - diagnostics only + self.logger.debug(f"{self._pod} status check failed: {exc}") + + self.logger.debug(f"{self._pod} not running anymore.") + + # Check logs and describe for diagnostics + try: + pod_logs = self.k8s_client.read_namespaced_pod_log( + name=self._pod.name, namespace=self._pod.namespace + ) + pod_description = self.k8s_client.read_namespaced_pod( + name=self._pod.name, namespace=self._pod.namespace + ) + self.logger.debug( + f"[{self._pod.name}] Pod logs before failure:\n{pod_logs}\n" + f"Pod description before failure:\n{pod_description}" + ) + except Exception as log_exc: + self.logger.debug( + f"[{self._pod.name}] Failed to get pod logs/description: {log_exc}" + ) + + self.logger.debug(f"Cleaning up {self._pod} after failure.") + try: + self._pod.clean_up() + except Exception as exc: # noqa: BLE001 - best-effort cleanup + self.logger.debug(f"Failed to clean up {self._pod}: {exc}") + + raise RuntimeError("Pod is not running anymore.") + def new_shell_session(self): - if not self.pod.is_running(): - raise ValueError("Pod is not running. 
Cannot create shell session.") + self._ensure_pod_running() session = ShellSession( shell_command=self.default_shell_command, @@ -376,8 +454,7 @@ def run( strip_output: bool = True, ) -> tuple[bool, str]: """Run a command in the pod. Return command status and output.""" - if not self.pod.is_running(): - raise ValueError("Pod is not running. Cannot run commands.") + self._ensure_pod_running() command = self.prepare_command(entrypoint) @@ -415,8 +492,25 @@ def run( except ApiException as e: success = False self.logger.debug( - f"[{self.pod.name}] Exception during command execution: {e}" + f"[{self.pod.name}] Exception during command `{command}`: {e}" ) + # Get kubectl logs and describe for diagnostics + try: + pod_logs = self.k8s_client.read_namespaced_pod_log( + name=self.pod.name, namespace=self.pod.namespace + ) + pod_description = self.k8s_client.read_namespaced_pod( + name=self.pod.name, namespace=self.pod.namespace + ) + self.logger.debug( + f"[{self.pod.name}] Pod logs:\n{pod_logs}\n" + f"Pod description:\n{pod_description}" + ) + except Exception as log_exc: + self.logger.debug( + f"[{self.pod.name}] Failed to get pod logs/description: {log_exc}" + ) + output = f"Command execution failed: {str(e)}" backoff = random.uniform(5, 10) # seconds time.sleep(backoff) @@ -431,15 +525,12 @@ def run( self.logger.debug(f"[{self.pod.name}] Output success `{success}`:\n{output}") return success, output - def setup_pod(self) -> None: - """Create and start a Kubernetes pod.""" + def setup_pod(self, max_retries: int = 3) -> None: + """Create and start a Kubernetes pod. - pod_name = _clean_for_kubernetes( - self._pod_name or f"dbg-gym.{self.task_name}.{str(uuid.uuid4())[:8]}" - ) - self.logger.debug( - f"Setting up pod {pod_name} with image: {self.registry}{self.base_image}" - ) + If a sandbox reservation conflict is detected, the pod is cleaned up + and a new pod with a fresh UUID is created. 
+ """ # Render pod_spec_kwargs as a Jinja2 template, replace variables, then load as dict. pod_spec_yaml = dump(self.pod_spec_kwargs) @@ -447,51 +538,81 @@ def setup_pod(self) -> None: rendered_yaml = pod_spec_template.render(os.environ) pod_spec_kwargs = safe_load(rendered_yaml) - # Create pod specification for Kubernetes. - pod_body = { - "apiVersion": "v1", - "kind": "Pod", - "metadata": { - "name": pod_name, - "namespace": self.namespace, - "labels": self.labels, - }, - "spec": { - "activeDeadlineSeconds": 3600 * 24, # a day - "restartPolicy": "Never", - "containers": [ - { - "name": "main", - "image": f"{self.registry}{self.base_image}", - "imagePullPolicy": "IfNotPresent", - "command": ["/bin/bash"], - "args": ["-c", "sleep infinity"], - "workingDir": self.working_dir, - "stdin": True, - "stdinOnce": False, - "tty": True, - "env": [ - {"name": k, "value": v} for k, v in self.env_vars.items() - ], - "resources": { - "requests": {"cpu": "0.5", "memory": "1Gi"}, - "limits": {"cpu": "2", "memory": "8Gi"}, - }, - } - ], - **pod_spec_kwargs, # e.g., nodeSelector, tolerations - }, - } + for attempt in range(max_retries): + # Generate a new pod name for each attempt to avoid sandbox conflicts + pod_name = _clean_for_kubernetes( + self._pod_name or f"dbg-gym.{self.task_name}.{str(uuid.uuid4())[:8]}" + ) + self.logger.debug( + f"Setting up pod {pod_name} (attempt {attempt + 1}/{max_retries}) " + f"with image: {self.registry}{self.base_image}" + ) - try: - self._pod = Pod(self.k8s_client, pod_body, logger=self.logger) + # Create pod specification for Kubernetes. 
+ pod_body = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "name": pod_name, + "namespace": self.namespace, + "labels": self.labels, + }, + "spec": { + "activeDeadlineSeconds": 3600 * 24, # a day + "restartPolicy": "Never", + "containers": [ + { + "name": "main", + "image": f"{self.registry}{self.base_image}", + "imagePullPolicy": "IfNotPresent", + "command": ["/bin/bash"], + "args": ["-c", "sleep infinity"], + "workingDir": self.working_dir, + "stdin": True, + "stdinOnce": False, + "tty": True, + "env": [ + {"name": k, "value": v} + for k, v in self.env_vars.items() + ], + "resources": { + "requests": {"cpu": "0.5", "memory": "1Gi"}, + "limits": {"cpu": "2", "memory": "8Gi"}, + }, + } + ], + **pod_spec_kwargs, # e.g., nodeSelector, tolerations + }, + } - # Run setup commands - self._run_setup_commands() - self.logger.debug(f"{self.pod} started successfully.") + try: + self._pod = Pod(self.k8s_client, pod_body, logger=self.logger) - except ApiException as e: - raise ValueError(f"Failed to create pod: {e}") + # Run setup commands + self._run_setup_commands() + self.logger.debug(f"{self.pod} started successfully.") + return # Success, exit the retry loop + + except SandboxReservationError as e: + self.logger.warning( + f"Sandbox reservation conflict for {pod_name}: {e}. " + f"Cleaning up and retrying with new pod name..." + ) + # Clean up the failed pod + if self._pod is not None: + self._pod.clean_up() + self._pod = None + + # Wait a bit before retrying to allow the container runtime to clean up + time.sleep(5) + + if attempt == max_retries - 1: + raise ValueError( + f"Failed to create pod after {max_retries} attempts " + f"due to sandbox reservation conflicts" + ) + except ApiException as e: + raise ValueError(f"Failed to create pod: {e}") def _run_setup_commands(self): """Run setup commands if any. 
If commands fail, delete the pod.""" diff --git a/debug_gym/gym/tools/pdb.py b/debug_gym/gym/tools/pdb.py index c52840d4..46ed1720 100644 --- a/debug_gym/gym/tools/pdb.py +++ b/debug_gym/gym/tools/pdb.py @@ -107,7 +107,8 @@ def start_pdb(self, environment) -> str: self._session = environment.terminal.new_shell_session() # init pdb and wait for the prompt self.entrypoint = self.entrypoint or environment.debug_entrypoint - initial_output = self._session.start(self.entrypoint, read_until="(Pdb)") + initial_output = f"Starting pdb session with entrypoint: {self.entrypoint}\n" + initial_output += self._session.start(self.entrypoint, read_until="(Pdb)") if "The program finished and will be restarted" in initial_output: self.stop_pdb() diff --git a/debug_gym/gym/tools/submit.py b/debug_gym/gym/tools/submit.py index eb08e578..00514992 100644 --- a/debug_gym/gym/tools/submit.py +++ b/debug_gym/gym/tools/submit.py @@ -9,7 +9,14 @@ class SubmitTool(EnvironmentTool): description = "Submit your changes once the task is complete." arguments = {} + def __init__(self, eval_on_submit=True): + super().__init__() + self.eval_on_submit = eval_on_submit + def use(self, environment, **kwargs) -> Observation: - eval_output = environment.eval() + output = "The agent terminated the session." 
+ if self.eval_on_submit: + output = environment.eval().output + environment.terminated = True - return Observation(self.name, eval_output.output) + return Observation(self.name, output) diff --git a/debug_gym/llms/azure_openai.py b/debug_gym/llms/azure_openai.py index ec0bc678..a966c6f4 100644 --- a/debug_gym/llms/azure_openai.py +++ b/debug_gym/llms/azure_openai.py @@ -1,5 +1,7 @@ import logging +import time +from azure.core.exceptions import ClientAuthenticationError from openai import AzureOpenAI, PermissionDeniedError from debug_gym.llms.constants import LLM_API_KEY_PLACEHOLDER, LLM_SCOPE_PLACEHOLDER @@ -14,10 +16,29 @@ class AzureOpenAILLM(OpenAILLM): + CLIENT_MAX_AGE_SECONDS = 3600 # 1 hour + AUTH_RETRY_DELAY_SECONDS = 5 + + def __init__(self, model_name, logger=None, llm_config=None, llm_config_file=None): + super().__init__(model_name, logger, llm_config, llm_config_file) + self._client = None + self._client_created_at = 0 + @property def client(self): - if getattr(self, "_client", None) is None: + now = time.time() + reason = None + + if self._client is None: + reason = "initialize" + elif now - self._client_created_at >= self.CLIENT_MAX_AGE_SECONDS: + reason = f"age>{self.CLIENT_MAX_AGE_SECONDS}s" + + if reason: + self.logger.debug("Creating Azure OpenAI client (%s)", reason) self._client = self._get_azure_oai_client() + self._client_created_at = time.time() + return self._client def _get_azure_oai_client(self): @@ -74,3 +95,20 @@ def _get_azure_oai_client(self): "Please provide an `api_key or `scope` in the configuration." 
) return aoai_client + + def need_to_be_retried(self, exception) -> bool: + need_to_retry = super().need_to_be_retried(exception) + + if isinstance(exception, ClientAuthenticationError): + self.logger.info( + "Client authentication error detected, refreshing Azure OpenAI client" + ) + self._invalidate_client_cache() + need_to_retry = True + time.sleep(self.AUTH_RETRY_DELAY_SECONDS) + + return need_to_retry + + def _invalidate_client_cache(self): + self._client = None + self._client_created_at = 0 diff --git a/debug_gym/llms/copilot.py b/debug_gym/llms/copilot.py index ee8075c6..1fc26697 100644 --- a/debug_gym/llms/copilot.py +++ b/debug_gym/llms/copilot.py @@ -23,7 +23,7 @@ class CopilotLLM(OpenAILLM): - CLIENT_MAX_AGE_SECONDS = 20 + CLIENT_MAX_AGE_SECONDS = 1200 # 20 minutes AUTH_RETRY_DELAY_SECONDS = 5 def __init__(self, model_name, logger=None, llm_config=None, llm_config_file=None): diff --git a/debug_gym/llms/openai.py b/debug_gym/llms/openai.py index ed0b9006..150b123d 100644 --- a/debug_gym/llms/openai.py +++ b/debug_gym/llms/openai.py @@ -94,6 +94,7 @@ def need_to_be_retried(self, exception) -> bool: "openai.APIError", "openai.APIConnectionError", "openai.RateLimitError", + "openai.InternalServerError", "openai.PermissionDeniedError", "openai.BadRequestError", # Add more as needed @@ -184,10 +185,24 @@ def parse_tool_call_response(self, response) -> ToolCall: name="empty_tool_response", arguments={}, ) + raw_arguments = response.function.arguments if response.function else "{}" + try: + parsed_arguments = json.loads(raw_arguments) + if not isinstance(parsed_arguments, dict): + raise ValueError("Tool arguments must decode to a JSON object") + except (json.JSONDecodeError, TypeError, ValueError) as exc: + snippet = (raw_arguments or "")[:2000] + self.logger.warning( + "Failed to decode tool call arguments as JSON (%s)." + " Falling back to empty arguments. 
Snippet: %s", + exc, + snippet, + ) + parsed_arguments = {} return ToolCall( id=response.id, name=response.function.name, - arguments=json.loads(response.function.arguments), + arguments=parsed_arguments, ) def format_tool_call_history( @@ -261,6 +276,11 @@ def generate(self, messages, tools, **kwargs) -> LLMResponse: if self.is_context_length_error(e): raise ContextLengthExceededError raise e + if not hasattr(response, "choices"): + raise RuntimeError( + "OpenAI chat completion returned unexpected payload without 'choices'" + ) + # LLM may select multiple tool calls, we only care about the first action if not response.choices[0].message.tool_calls: # LLM failed to call a tool diff --git a/debug_gym/logger.py b/debug_gym/logger.py index 805c807f..4b71aece 100644 --- a/debug_gym/logger.py +++ b/debug_gym/logger.py @@ -505,6 +505,7 @@ class DebugGymLogger(logging.Logger): LOG_QUEUE = mp.Queue(maxsize=10000) PROGRESS_QUEUE = mp.Queue(maxsize=50000) # Increased from 10000 to 50000 _is_worker = False + _main_process_logger = None @classmethod def is_worker(cls): @@ -541,6 +542,8 @@ def __init__( self.propagate = False self.setLevel(level) # Set logger level, might be overridden by file handler + if DebugGymLogger._main_process_logger is not None: + self._is_worker = True # Placeholders for rich live, log listener thread, and stop event # Will be initialized if the logger is the main process logger @@ -550,6 +553,8 @@ def __init__( self._log_listener_thread = None # Thread to process logs from workers if self.is_main(): self._initialize_main_logger(level) + DebugGymLogger._main_process_logger = self + self.log_file = None # File handler for logging to a file self.log_dir = Path(log_dir) if log_dir else None if self.log_dir: # Directory to store log files diff --git a/notebooks/tutorial.ipynb b/notebooks/tutorial.ipynb index df93130c..7e93b7d3 100644 --- a/notebooks/tutorial.ipynb +++ b/notebooks/tutorial.ipynb @@ -38,10 +38,243 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 1, "id": "cb1d612f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Obtaining file:///workspaces/debug-gym\n", + " Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", + "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: datasets==3.6.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (3.6.0)\n", + "Requirement already satisfied: openai==1.99.9 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (1.99.9)\n", + "Requirement already satisfied: azure-identity==1.22.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (1.22.0)\n", + "Requirement already satisfied: tenacity==9.1.2 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (9.1.2)\n", + "Requirement already satisfied: pyyaml==6.0.2 in /home/codespace/.local/lib/python3.12/site-packages (from debug-gym==1.2.0) (6.0.2)\n", + "Requirement already satisfied: pytest==8.3.5 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (8.3.5)\n", + "Requirement already satisfied: gitignore_parser==0.1.12 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (0.1.12)\n", + "Requirement already satisfied: termcolor==3.1.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (3.1.0)\n", + "Requirement already satisfied: transformers==4.51.3 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (4.51.3)\n", + "Requirement already satisfied: tiktoken in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (0.12.0)\n", + "Requirement already 
satisfied: docker==7.1.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (7.1.0)\n", + "Requirement already satisfied: swebench==4.0.3 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (4.0.3)\n", + "Requirement already satisfied: swesmith==0.0.4 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (0.0.4)\n", + "Requirement already satisfied: prompt_toolkit==3.0.51 in /home/codespace/.local/lib/python3.12/site-packages (from debug-gym==1.2.0) (3.0.51)\n", + "Requirement already satisfied: anthropic==0.51.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (0.51.0)\n", + "Requirement already satisfied: jinja2==3.1.6 in /home/codespace/.local/lib/python3.12/site-packages (from debug-gym==1.2.0) (3.1.6)\n", + "Requirement already satisfied: rich==14.0.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (14.0.0)\n", + "Requirement already satisfied: kubernetes==33.1.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from debug-gym==1.2.0) (33.1.0)\n", + "Requirement already satisfied: anyio<5,>=3.5.0 in /home/codespace/.local/lib/python3.12/site-packages (from anthropic==0.51.0->debug-gym==1.2.0) (4.9.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from anthropic==0.51.0->debug-gym==1.2.0) (1.9.0)\n", + "Requirement already satisfied: httpx<1,>=0.25.0 in /home/codespace/.local/lib/python3.12/site-packages (from anthropic==0.51.0->debug-gym==1.2.0) (0.28.1)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from anthropic==0.51.0->debug-gym==1.2.0) (0.11.1)\n", + "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from anthropic==0.51.0->debug-gym==1.2.0) (2.12.4)\n", + "Requirement already satisfied: sniffio in 
/home/codespace/.local/lib/python3.12/site-packages (from anthropic==0.51.0->debug-gym==1.2.0) (1.3.1)\n", + "Requirement already satisfied: typing-extensions<5,>=4.10 in /usr/local/python/3.12/lib/python3.12/site-packages (from anthropic==0.51.0->debug-gym==1.2.0) (4.15.0)\n", + "Requirement already satisfied: azure-core>=1.31.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from azure-identity==1.22.0->debug-gym==1.2.0) (1.36.0)\n", + "Requirement already satisfied: cryptography>=2.5 in /usr/local/python/3.12/lib/python3.12/site-packages (from azure-identity==1.22.0->debug-gym==1.2.0) (46.0.3)\n", + "Requirement already satisfied: msal>=1.30.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from azure-identity==1.22.0->debug-gym==1.2.0) (1.34.0)\n", + "Requirement already satisfied: msal-extensions>=1.2.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from azure-identity==1.22.0->debug-gym==1.2.0) (1.3.1)\n", + "Requirement already satisfied: filelock in /home/codespace/.local/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (3.13.1)\n", + "Requirement already satisfied: numpy>=1.17 in /home/codespace/.local/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (2.2.5)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (22.0.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (0.3.8)\n", + "Requirement already satisfied: pandas in /home/codespace/.local/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (2.2.3)\n", + "Requirement already satisfied: requests>=2.32.2 in /home/codespace/.local/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/python/3.12/lib/python3.12/site-packages (from 
datasets==3.6.0->debug-gym==1.2.0) (4.67.1)\n", + "Requirement already satisfied: xxhash in /usr/local/python/3.12/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (3.6.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/python/3.12/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /home/codespace/.local/lib/python3.12/site-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (2024.6.1)\n", + "Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (0.36.0)\n", + "Requirement already satisfied: packaging in /home/codespace/.local/lib/python3.12/site-packages (from datasets==3.6.0->debug-gym==1.2.0) (25.0)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /home/codespace/.local/lib/python3.12/site-packages (from docker==7.1.0->debug-gym==1.2.0) (2.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/codespace/.local/lib/python3.12/site-packages (from jinja2==3.1.6->debug-gym==1.2.0) (3.0.2)\n", + "Requirement already satisfied: certifi>=14.05.14 in /home/codespace/.local/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (2025.4.26)\n", + "Requirement already satisfied: six>=1.9.0 in /home/codespace/.local/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (1.17.0)\n", + "Requirement already satisfied: python-dateutil>=2.5.3 in /home/codespace/.local/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (2.9.0.post0)\n", + "Requirement already satisfied: google-auth>=1.0.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (2.43.0)\n", + "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in 
/home/codespace/.local/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (1.8.0)\n", + "Requirement already satisfied: requests-oauthlib in /usr/local/python/3.12/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (2.0.0)\n", + "Requirement already satisfied: oauthlib>=3.2.2 in /usr/local/python/3.12/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (3.3.1)\n", + "Requirement already satisfied: durationpy>=0.7 in /usr/local/python/3.12/lib/python3.12/site-packages (from kubernetes==33.1.0->debug-gym==1.2.0) (0.10)\n", + "Requirement already satisfied: wcwidth in /home/codespace/.local/lib/python3.12/site-packages (from prompt_toolkit==3.0.51->debug-gym==1.2.0) (0.2.13)\n", + "Requirement already satisfied: iniconfig in /usr/local/python/3.12/lib/python3.12/site-packages (from pytest==8.3.5->debug-gym==1.2.0) (2.3.0)\n", + "Requirement already satisfied: pluggy<2,>=1.5 in /usr/local/python/3.12/lib/python3.12/site-packages (from pytest==8.3.5->debug-gym==1.2.0) (1.6.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from rich==14.0.0->debug-gym==1.2.0) (4.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from rich==14.0.0->debug-gym==1.2.0) (2.19.2)\n", + "Requirement already satisfied: beautifulsoup4 in /home/codespace/.local/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (4.13.4)\n", + "Requirement already satisfied: chardet in /usr/local/python/3.12/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (5.2.0)\n", + "Requirement already satisfied: ghapi in /usr/local/python/3.12/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (1.0.8)\n", + "Requirement already satisfied: GitPython in /home/codespace/.local/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (3.1.44)\n", + 
"Requirement already satisfied: modal in /usr/local/python/3.12/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (1.2.1)\n", + "Requirement already satisfied: pre-commit in /usr/local/python/3.12/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (4.3.0)\n", + "Requirement already satisfied: python-dotenv in /usr/local/python/3.12/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (1.2.1)\n", + "Requirement already satisfied: unidiff in /usr/local/python/3.12/lib/python3.12/site-packages (from swebench==4.0.3->debug-gym==1.2.0) (0.7.5)\n", + "Requirement already satisfied: astor in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (0.8.1)\n", + "Requirement already satisfied: jupyter in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (1.1.1)\n", + "Requirement already satisfied: libcst in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (1.8.6)\n", + "Requirement already satisfied: litellm in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (1.79.1)\n", + "Requirement already satisfied: matplotlib in /home/codespace/.local/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (3.10.1)\n", + "Requirement already satisfied: sglang in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (0.5.2)\n", + "Requirement already satisfied: sparklines in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (0.7.0)\n", + "Requirement already satisfied: tree-sitter in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (0.25.2)\n", + "Requirement already satisfied: tree-sitter-go in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (0.25.0)\n", + "Requirement already 
satisfied: textual in /usr/local/python/3.12/lib/python3.12/site-packages (from swesmith==0.0.4->debug-gym==1.2.0) (6.2.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/python/3.12/lib/python3.12/site-packages (from transformers==4.51.3->debug-gym==1.2.0) (2025.11.3)\n", + "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/python/3.12/lib/python3.12/site-packages (from transformers==4.51.3->debug-gym==1.2.0) (0.21.4)\n", + "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/python/3.12/lib/python3.12/site-packages (from transformers==4.51.3->debug-gym==1.2.0) (0.6.2)\n", + "Requirement already satisfied: idna>=2.8 in /home/codespace/.local/lib/python3.12/site-packages (from anyio<5,>=3.5.0->anthropic==0.51.0->debug-gym==1.2.0) (3.10)\n", + "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/python/3.12/lib/python3.12/site-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (3.13.2)\n", + "Requirement already satisfied: httpcore==1.* in /home/codespace/.local/lib/python3.12/site-packages (from httpx<1,>=0.25.0->anthropic==0.51.0->debug-gym==1.2.0) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /home/codespace/.local/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.25.0->anthropic==0.51.0->debug-gym==1.2.0) (0.16.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/python/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.24.0->datasets==3.6.0->debug-gym==1.2.0) (1.2.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from pydantic<3,>=1.9.0->anthropic==0.51.0->debug-gym==1.2.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.5 in /usr/local/python/3.12/lib/python3.12/site-packages (from pydantic<3,>=1.9.0->anthropic==0.51.0->debug-gym==1.2.0) (2.41.5)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in 
/usr/local/python/3.12/lib/python3.12/site-packages (from pydantic<3,>=1.9.0->anthropic==0.51.0->debug-gym==1.2.0) (0.4.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /home/codespace/.local/lib/python3.12/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/python/3.12/lib/python3.12/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (6.7.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets==3.6.0->debug-gym==1.2.0) (1.22.0)\n", + "Requirement already satisfied: cffi>=2.0.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from cryptography>=2.5->azure-identity==1.22.0->debug-gym==1.2.0) (2.0.0)\n", + "Requirement already satisfied: pycparser in /home/codespace/.local/lib/python3.12/site-packages (from 
cffi>=2.0.0->cryptography>=2.5->azure-identity==1.22.0->debug-gym==1.2.0) (2.22)\n", + "Requirement already satisfied: cachetools<7.0,>=2.0.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes==33.1.0->debug-gym==1.2.0) (6.2.1)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes==33.1.0->debug-gym==1.2.0) (0.4.2)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/python/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes==33.1.0->debug-gym==1.2.0) (4.9.1)\n", + "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/python/3.12/lib/python3.12/site-packages (from rsa<5,>=3.1.4->google-auth>=1.0.1->kubernetes==33.1.0->debug-gym==1.2.0) (0.6.1)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich==14.0.0->debug-gym==1.2.0) (0.1.2)\n", + "Requirement already satisfied: PyJWT<3,>=1.0.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity==1.22.0->debug-gym==1.2.0) (2.10.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/codespace/.local/lib/python3.12/site-packages (from requests>=2.32.2->datasets==3.6.0->debug-gym==1.2.0) (3.4.2)\n", + "Requirement already satisfied: soupsieve>1.2 in /home/codespace/.local/lib/python3.12/site-packages (from beautifulsoup4->swebench==4.0.3->debug-gym==1.2.0) (2.7)\n", + "Requirement already satisfied: fastcore>=1.7.2 in /usr/local/python/3.12/lib/python3.12/site-packages (from ghapi->swebench==4.0.3->debug-gym==1.2.0) (1.8.15)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /home/codespace/.local/lib/python3.12/site-packages (from GitPython->swebench==4.0.3->debug-gym==1.2.0) (4.0.12)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in 
/home/codespace/.local/lib/python3.12/site-packages (from gitdb<5,>=4.0.1->GitPython->swebench==4.0.3->debug-gym==1.2.0) (5.0.2)\n", + "Requirement already satisfied: notebook in /usr/local/python/3.12/lib/python3.12/site-packages (from jupyter->swesmith==0.0.4->debug-gym==1.2.0) (7.4.7)\n", + "Requirement already satisfied: jupyter-console in /usr/local/python/3.12/lib/python3.12/site-packages (from jupyter->swesmith==0.0.4->debug-gym==1.2.0) (6.6.3)\n", + "Requirement already satisfied: nbconvert in /home/codespace/.local/lib/python3.12/site-packages (from jupyter->swesmith==0.0.4->debug-gym==1.2.0) (7.16.6)\n", + "Requirement already satisfied: ipykernel in /home/codespace/.local/lib/python3.12/site-packages (from jupyter->swesmith==0.0.4->debug-gym==1.2.0) (6.29.5)\n", + "Requirement already satisfied: ipywidgets in /usr/local/python/3.12/lib/python3.12/site-packages (from jupyter->swesmith==0.0.4->debug-gym==1.2.0) (8.1.8)\n", + "Requirement already satisfied: jupyterlab in /usr/local/python/3.12/lib/python3.12/site-packages (from jupyter->swesmith==0.0.4->debug-gym==1.2.0) (4.4.10)\n", + "Requirement already satisfied: comm>=0.1.1 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.2.2)\n", + "Requirement already satisfied: debugpy>=1.6.5 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.8.14)\n", + "Requirement already satisfied: ipython>=7.23.1 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (9.2.0)\n", + "Requirement already satisfied: jupyter-client>=6.1.12 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (8.6.3)\n", + "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /home/codespace/.local/lib/python3.12/site-packages (from 
ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (5.7.2)\n", + "Requirement already satisfied: matplotlib-inline>=0.1 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.1.7)\n", + "Requirement already satisfied: nest-asyncio in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.6.0)\n", + "Requirement already satisfied: psutil in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (7.0.0)\n", + "Requirement already satisfied: pyzmq>=24 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (26.4.0)\n", + "Requirement already satisfied: tornado>=6.1 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (6.4.2)\n", + "Requirement already satisfied: traitlets>=5.4.0 in /home/codespace/.local/lib/python3.12/site-packages (from ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (5.14.3)\n", + "Requirement already satisfied: decorator in /home/codespace/.local/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (5.2.1)\n", + "Requirement already satisfied: ipython-pygments-lexers in /home/codespace/.local/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.1.1)\n", + "Requirement already satisfied: jedi>=0.16 in /home/codespace/.local/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.19.2)\n", + "Requirement already satisfied: pexpect>4.3 in /home/codespace/.local/lib/python3.12/site-packages (from ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (4.9.0)\n", + "Requirement already satisfied: stack_data in /home/codespace/.local/lib/python3.12/site-packages 
(from ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.6.3)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.4 in /home/codespace/.local/lib/python3.12/site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.8.4)\n", + "Requirement already satisfied: platformdirs>=2.5 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (4.3.7)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /home/codespace/.local/lib/python3.12/site-packages (from pexpect>4.3->ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.7.0)\n", + "Requirement already satisfied: widgetsnbextension~=4.0.14 in /usr/local/python/3.12/lib/python3.12/site-packages (from ipywidgets->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (4.0.15)\n", + "Requirement already satisfied: jupyterlab_widgets~=3.0.15 in /usr/local/python/3.12/lib/python3.12/site-packages (from ipywidgets->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (3.0.16)\n", + "Requirement already satisfied: async-lru>=1.0.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.0.5)\n", + "Requirement already satisfied: jupyter-lsp>=2.0.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.2.5)\n", + "Requirement already satisfied: jupyter-server<3,>=2.4.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.15.0)\n", + "Requirement already satisfied: jupyterlab-server<3,>=2.27.1 in /home/codespace/.local/lib/python3.12/site-packages (from jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.27.3)\n", + "Requirement already satisfied: notebook-shim>=0.2 in /home/codespace/.local/lib/python3.12/site-packages (from 
jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.2.4)\n", + "Requirement already satisfied: setuptools>=41.1.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (80.3.1)\n", + "Requirement already satisfied: argon2-cffi>=21.1 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (23.1.0)\n", + "Requirement already satisfied: jupyter-events>=0.11.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.12.0)\n", + "Requirement already satisfied: jupyter-server-terminals>=0.4.4 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.5.3)\n", + "Requirement already satisfied: nbformat>=5.3.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (5.10.4)\n", + "Requirement already satisfied: overrides>=5.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (7.7.0)\n", + "Requirement already satisfied: prometheus-client>=0.9 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.21.1)\n", + "Requirement already satisfied: send2trash>=1.8.2 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.8.3)\n", + "Requirement already satisfied: terminado>=0.8.3 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.18.1)\n", + "Requirement already satisfied: babel>=2.10 in 
/home/codespace/.local/lib/python3.12/site-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.17.0)\n", + "Requirement already satisfied: json5>=0.9.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.12.0)\n", + "Requirement already satisfied: jsonschema>=4.18.0 in /home/codespace/.local/lib/python3.12/site-packages (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (4.23.0)\n", + "Requirement already satisfied: argon2-cffi-bindings in /home/codespace/.local/lib/python3.12/site-packages (from argon2-cffi>=21.1->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (21.2.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2025.4.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.36.2)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema>=4.18.0->jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.24.0)\n", + "Requirement already satisfied: python-json-logger>=2.0.4 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (3.3.0)\n", + "Requirement already satisfied: rfc3339-validator in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.1.4)\n", + 
"Requirement already satisfied: rfc3986-validator>=0.1.1 in /home/codespace/.local/lib/python3.12/site-packages (from jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.1.1)\n", + "Requirement already satisfied: fqdn in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.5.1)\n", + "Requirement already satisfied: isoduration in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (20.11.0)\n", + "Requirement already satisfied: jsonpointer>1.13 in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (3.0.0)\n", + "Requirement already satisfied: uri-template in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.3.0)\n", + "Requirement already satisfied: webcolors>=24.6.0 in /home/codespace/.local/lib/python3.12/site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (24.11.1)\n", + "Requirement already satisfied: bleach!=5.0.0 in /home/codespace/.local/lib/python3.12/site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (6.2.0)\n", + "Requirement already satisfied: defusedxml in /home/codespace/.local/lib/python3.12/site-packages (from nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.7.1)\n", + "Requirement already satisfied: jupyterlab-pygments in 
/home/codespace/.local/lib/python3.12/site-packages (from nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.3.0)\n", + "Requirement already satisfied: mistune<4,>=2.0.3 in /home/codespace/.local/lib/python3.12/site-packages (from nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (3.1.3)\n", + "Requirement already satisfied: nbclient>=0.5.0 in /home/codespace/.local/lib/python3.12/site-packages (from nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.10.2)\n", + "Requirement already satisfied: pandocfilters>=1.4.1 in /home/codespace/.local/lib/python3.12/site-packages (from nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.5.1)\n", + "Requirement already satisfied: webencodings in /home/codespace/.local/lib/python3.12/site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.5.1)\n", + "Requirement already satisfied: tinycss2<1.5,>=1.1.0 in /home/codespace/.local/lib/python3.12/site-packages (from bleach[css]!=5.0.0->nbconvert->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.4.0)\n", + "Requirement already satisfied: fastjsonschema>=2.15 in /home/codespace/.local/lib/python3.12/site-packages (from nbformat>=5.3.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.21.1)\n", + "Requirement already satisfied: arrow>=0.15.0 in /home/codespace/.local/lib/python3.12/site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (1.3.0)\n", + "Requirement already satisfied: types-python-dateutil>=2.8.10 in /home/codespace/.local/lib/python3.12/site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.9.0.20241206)\n", + "Requirement already satisfied: click in /usr/local/python/3.12/lib/python3.12/site-packages (from 
litellm->swesmith==0.0.4->debug-gym==1.2.0) (8.3.0)\n", + "Requirement already satisfied: fastuuid>=0.13.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from litellm->swesmith==0.0.4->debug-gym==1.2.0) (0.14.0)\n", + "Requirement already satisfied: importlib-metadata>=6.8.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from litellm->swesmith==0.0.4->debug-gym==1.2.0) (8.7.0)\n", + "Requirement already satisfied: zipp>=3.20 in /usr/local/python/3.12/lib/python3.12/site-packages (from importlib-metadata>=6.8.0->litellm->swesmith==0.0.4->debug-gym==1.2.0) (3.23.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib->swesmith==0.0.4->debug-gym==1.2.0) (1.3.2)\n", + "Requirement already satisfied: cycler>=0.10 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib->swesmith==0.0.4->debug-gym==1.2.0) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib->swesmith==0.0.4->debug-gym==1.2.0) (4.57.0)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib->swesmith==0.0.4->debug-gym==1.2.0) (1.4.8)\n", + "Requirement already satisfied: pillow>=8 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib->swesmith==0.0.4->debug-gym==1.2.0) (11.2.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib->swesmith==0.0.4->debug-gym==1.2.0) (3.2.3)\n", + "Requirement already satisfied: cbor2 in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (5.7.1)\n", + "Requirement already satisfied: grpclib<0.4.9,>=0.4.7 in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (0.4.8)\n", + "Requirement already satisfied: 
protobuf!=4.24.0,<7.0,>=3.19 in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (6.33.0)\n", + "Requirement already satisfied: synchronicity~=0.10.2 in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (0.10.2)\n", + "Requirement already satisfied: toml in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (0.10.2)\n", + "Requirement already satisfied: typer>=0.9 in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (0.20.0)\n", + "Requirement already satisfied: types-certifi in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (2021.10.8.3)\n", + "Requirement already satisfied: types-toml in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (0.10.8.20240310)\n", + "Requirement already satisfied: watchfiles in /usr/local/python/3.12/lib/python3.12/site-packages (from modal->swebench==4.0.3->debug-gym==1.2.0) (1.1.1)\n", + "Requirement already satisfied: h2<5,>=3.1.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from grpclib<0.4.9,>=0.4.7->modal->swebench==4.0.3->debug-gym==1.2.0) (4.3.0)\n", + "Requirement already satisfied: hyperframe<7,>=6.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from h2<5,>=3.1.0->grpclib<0.4.9,>=0.4.7->modal->swebench==4.0.3->debug-gym==1.2.0) (6.1.0)\n", + "Requirement already satisfied: hpack<5,>=4.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from h2<5,>=3.1.0->grpclib<0.4.9,>=0.4.7->modal->swebench==4.0.3->debug-gym==1.2.0) (4.1.0)\n", + "Requirement already satisfied: sigtools>=4.0.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from synchronicity~=0.10.2->modal->swebench==4.0.3->debug-gym==1.2.0) (4.0.1)\n", + "Requirement already satisfied: shellingham>=1.3.0 in 
/usr/local/python/3.12/lib/python3.12/site-packages (from typer>=0.9->modal->swebench==4.0.3->debug-gym==1.2.0) (1.5.4)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/codespace/.local/lib/python3.12/site-packages (from pandas->datasets==3.6.0->debug-gym==1.2.0) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /home/codespace/.local/lib/python3.12/site-packages (from pandas->datasets==3.6.0->debug-gym==1.2.0) (2025.2)\n", + "Requirement already satisfied: cfgv>=2.0.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from pre-commit->swebench==4.0.3->debug-gym==1.2.0) (3.4.0)\n", + "Requirement already satisfied: identify>=1.0.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from pre-commit->swebench==4.0.3->debug-gym==1.2.0) (2.6.15)\n", + "Requirement already satisfied: nodeenv>=0.11.1 in /usr/local/python/3.12/lib/python3.12/site-packages (from pre-commit->swebench==4.0.3->debug-gym==1.2.0) (1.9.1)\n", + "Requirement already satisfied: virtualenv>=20.10.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from pre-commit->swebench==4.0.3->debug-gym==1.2.0) (20.35.4)\n", + "Requirement already satisfied: distlib<1,>=0.3.7 in /usr/local/python/3.12/lib/python3.12/site-packages (from virtualenv>=20.10.0->pre-commit->swebench==4.0.3->debug-gym==1.2.0) (0.4.0)\n", + "Requirement already satisfied: setproctitle in /usr/local/python/3.12/lib/python3.12/site-packages (from sglang->swesmith==0.0.4->debug-gym==1.2.0) (1.3.7)\n", + "Requirement already satisfied: executing>=1.2.0 in /home/codespace/.local/lib/python3.12/site-packages (from stack_data->ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (2.2.0)\n", + "Requirement already satisfied: asttokens>=2.1.0 in /home/codespace/.local/lib/python3.12/site-packages (from stack_data->ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (3.0.0)\n", + "Requirement already satisfied: pure-eval in 
/home/codespace/.local/lib/python3.12/site-packages (from stack_data->ipython>=7.23.1->ipykernel->jupyter->swesmith==0.0.4->debug-gym==1.2.0) (0.2.3)\n", + "Requirement already satisfied: linkify-it-py<3,>=1 in /usr/local/python/3.12/lib/python3.12/site-packages (from markdown-it-py[linkify,plugins]>=2.1.0->textual->swesmith==0.0.4->debug-gym==1.2.0) (2.0.3)\n", + "Requirement already satisfied: mdit-py-plugins>=0.5.0 in /usr/local/python/3.12/lib/python3.12/site-packages (from markdown-it-py[linkify,plugins]>=2.1.0->textual->swesmith==0.0.4->debug-gym==1.2.0) (0.5.0)\n", + "Requirement already satisfied: uc-micro-py in /usr/local/python/3.12/lib/python3.12/site-packages (from linkify-it-py<3,>=1->markdown-it-py[linkify,plugins]>=2.1.0->textual->swesmith==0.0.4->debug-gym==1.2.0) (1.0.3)\n", + "Building wheels for collected packages: debug-gym\n", + " Building editable for debug-gym (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for debug-gym: filename=debug_gym-1.2.0-0.editable-py3-none-any.whl size=11796 sha256=135124752cbe550a31e2d4f5709e27d52179a8c93d42f55297becbcafc8f64d0\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-09yb2sfz/wheels/41/1a/e2/7dc55b8ab794d2e471cef23dc2b4ca8d320978ce3d6ca9dcfe\n", + "Successfully built debug-gym\n", + "Installing collected packages: debug-gym\n", + " Attempting uninstall: debug-gym\n", + " Found existing installation: debug-gym 1.2.0\n", + " Uninstalling debug-gym-1.2.0:\n", + " Successfully uninstalled debug-gym-1.2.0\n", + "Successfully installed debug-gym-1.2.0\n" + ] + } + ], "source": [ "# In codespace you can simply install the package directly from the notebook.\n", "!pip install -e .." 
@@ -85,9 +318,7 @@ "from debug_gym.gym.envs import MiniNightmareEnv\n", "from debug_gym.logger import DebugGymLogger\n", "\n", - "# For the sake of this tutorial, we disable the logger to avoid cluttering the output.\n", - "logger = DebugGymLogger(\"debug-gym\", level=\"ERROR\")\n", - "logger.disabled = True\n", + "logger = DebugGymLogger(\"debug-gym\")\n", "\n", "# Initialize the MiniNightmare benchmark environment.\n", "# This will download the dataset and set up the environment.\n", @@ -122,17 +353,10 @@ "======================================================================\n", " DEBUG GYM ENVIRONMENT INFO \n", "======================================================================\n", - "📊 Status: 🔄 (IN PROGRESS)\t🎯 Score: 0/1\t✏️ Rewrites: 0\n", + "📊 Status: 🔄 (IN PROGRESS)\t🎯 Score: 0/?\t✏️ Rewrites: 0\n", "👁️ Observation:\n", "```\n", - "============================= test session starts ==============================\n", - "collected 1 item\n", - "\n", - "test.py F\n", - "\n", - "=========================== short test summary info ============================\n", - "FAILED test.py::test_calculate_stats - KeyError: 'Price'\n", - "============================== 1 failed in 0.39s ===============================\n", + "The program doesn't behave as intended. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue. Beaware that the bug may not be in the code you initially see.\n", "```\n", "\n", "🛠️ Available Tools (0):\n", @@ -143,7 +367,8 @@ "\n", "📁 Directory Structure:\n", " Listing files in the current working directory. (read-only) indicates read-only files. 
Max depth: 1.\n", - " /tmp/RepoEnv-hk6i04hj/\n", + " /testbed/\n", + " |-- .git/\n", " |-- pandas_dataframe_code.py\n", " |-- test.py (read-only)\n", "============================================================\n" @@ -189,14 +414,7 @@ "📊 Status: 🔄 (IN PROGRESS)\t🎯 Score: 0/1\t✏️ Rewrites: 0\n", "👁️ Observation:\n", "```\n", - "============================= test session starts ==============================\n", - "collected 1 item\n", - "\n", - "test.py F\n", - "\n", - "=========================== short test summary info ============================\n", - "FAILED test.py::test_calculate_stats - KeyError: 'Price'\n", - "============================== 1 failed in 0.39s ===============================\n", + "The program doesn't behave as intended. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue. Beaware that the bug may not be in the code you initially see.\n", "```\n", "\n", "🛠️ Available Tools (3):\n", @@ -209,7 +427,8 @@ "\n", "📁 Directory Structure:\n", " Listing files in the current working directory. (read-only) indicates read-only files. Max depth: 1.\n", - " /tmp/RepoEnv-32lac0qw/\n", + " /testbed/\n", + " |-- .git/\n", " |-- pandas_dataframe_code.py\n", " |-- test.py (read-only)\n", "============================================================\n" @@ -217,7 +436,6 @@ } ], "source": [ - "\n", "from debug_gym.gym.tools.toolbox import Toolbox\n", "\n", "for tool in [\"view\", \"rewrite\", \"eval\"]:\n", @@ -243,13 +461,6 @@ "id": "3b7b80ae", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" - ] - }, { "data": { "text/plain": [ @@ -266,7 +477,7 @@ "from debug_gym.llms.base import LLM, LLMConfig\n", "from debug_gym.llms import OpenAILLM\n", "\n", - "MODEL = \"gpt-4.1\" # gpt-5 tool calling seems broken at the moment 🤷.\n", + "MODEL = \"gpt-4.1\"\n", "llm_config = LLMConfig(\n", " model=MODEL,\n", " context_limit=128,\n", @@ -299,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "5645e9fa", "metadata": {}, "outputs": [ @@ -318,6 +529,110 @@ "==================== STEP 1 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", + "\u001b[35mThe program doesn't behave as intended. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue. Beaware that the bug may not be in the code you initially see.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_nPnaZhXXnlW3MbwCjv6wGlhb', name='view', arguments={'path': 'main.py'})\u001b[0m\n", + "\n", + "==================== STEP 2 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`main.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_PtzbHl4TKwtRvzO79wH91hrR', name='view', arguments={'path': 'app.py'})\u001b[0m\n", + "\n", + "==================== STEP 3 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. 
Error message:\n", + "`app.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_sFcQVxdRILsGD4KmFkQ1U7xN', name='view', arguments={'path': 'src/main.py'})\u001b[0m\n", + "\n", + "==================== STEP 4 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`src/main.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_JRpBtSBrSw4aaYOm9eSrNFwE', name='view', arguments={'path': 'src/app.py'})\u001b[0m\n", + "\n", + "==================== STEP 5 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`src/app.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_zXNiEzxKJX8iU4YGmVbJ6iq9', name='view', arguments={'path': 'tests.py'})\u001b[0m\n", + "\n", + "==================== STEP 6 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`tests.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_B8jMF9rKoOnaqJZIPTGnQfV7', name='view', arguments={'path': 'test_main.py'})\u001b[0m\n", + "\n", + "==================== STEP 7 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. 
Error message:\n", + "`test_main.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_8KOKyHzwAJGWp1zyznHbpxtj', name='view', arguments={'path': 'test_app.py'})\u001b[0m\n", + "\n", + "==================== STEP 8 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`test_app.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mWe are not sure about the filename. Let's list the files in the directory to see what code files are available.\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_RReRIWLJ9bc29VQjWFqbm2V3', name='view', arguments={'path': '.', 'start': None, 'end': None, 'include_line_numbers_and_breakpoints': False})\u001b[0m\n", + "\n", + "==================== STEP 9 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mFailed to run command `cat /testbed`:\n", + "cat: /testbed: Is a directory\n", + "\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_DC6CzudUPTkcYe28XgcmU5x2', name='view', arguments={'path': 'solution.py'})\u001b[0m\n", + "\n", + "==================== STEP 10 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`solution.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_d2LyQ6uWtVDcQaosz9irYC3x', name='view', arguments={'path': 'code.py'})\u001b[0m\n", + "\n", + "==================== STEP 11 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. 
Error message:\n", + "`code.py` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_wLmBIRv34gxKZ0bP7PEnO2Zh', name='view', arguments={'path': 'main', 'start': None, 'end': None, 'include_line_numbers_and_breakpoints': True})\u001b[0m\n", + "\n", + "==================== STEP 12 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`main` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mThere might be subdirectories or other file naming conventions. To move forward, I need to see what files exist in this repository. I'll try looking for a README.md file, which often exists and can provide clues.\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_5fAs1yKD9uUAn8P8sVjRR0bk', name='view', arguments={'path': 'README.md'})\u001b[0m\n", + "\n", + "==================== STEP 13 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mView failed. Error message:\n", + "`README.md` does not exist or is not in the working directory `/testbed`.\u001b[0m\n", + "\u001b[36mIt appears that I cannot directly detect the available files. Since I cannot list files or detect files with generic names, I should attempt to run the test suite (if present) to gather information, such as error output or traceback that will often mention the file names and line numbers.\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_7k8hztkdmLzA9lHJ30e8UPtB', name='eval', arguments={})\u001b[0m\n", + "\n", + "==================== STEP 14 ====================\n", + "Nb. 
of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", "\u001b[35m============================= test session starts ==============================\n", "collected 1 item\n", "\n", @@ -325,19 +640,11 @@ "\n", "=========================== short test summary info ============================\n", "FAILED test.py::test_calculate_stats - KeyError: 'Price'\n", - "============================== 1 failed in 0.41s ===============================\u001b[0m\n", - "\u001b[36mThe test failed with a KeyError for 'Price', which suggests the code attempted to access a dictionary key or dataframe column called 'Price' that doesn't exist. Let's check the code in the relevant file where this key is accessed. Assuming it's in the main implementation file, let's view the first 30 lines of main.py to investigate.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_eRR0MxIb67pfvFRz0o2qemN6', name='view', arguments={'path': 'main.py', 'start': 1, 'end': 30, 'include_line_numbers_and_breakpoints': True})\u001b[0m\n", - "\n", - "==================== STEP 2 ====================\n", - "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", - "\n", - "\u001b[35mView failed. Error message:\n", - "`main.py` does not exist or is not in the working directory `/tmp/RepoEnv-wmbm3sgj`.\u001b[0m\n", - "\u001b[36mThere's no main.py file. Since the error is about a KeyError: 'Price', let's check the contents of test.py instead. 
The test might show which file or function is being tested and what is expected, guiding us to the file to inspect.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_meMCEf3wtoMOP1dIE1qN0lej', name='view', arguments={'path': 'test.py', 'start': 1, 'end': 30, 'include_line_numbers_and_breakpoints': True})\u001b[0m\n", + "============================== 1 failed in 0.51s ===============================\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_hEcuzZDRGBOPiG96eXtliAmN', name='view', arguments={'path': 'test.py'})\u001b[0m\n", "\n", - "==================== STEP 3 ====================\n", + "==================== STEP 15 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mViewing `test.py`, lines 1-8 of 8 total lines. The file is read-only.\n", @@ -354,12 +661,10 @@ "```\n", "\n", "\u001b[0m\n", - "\u001b[36mThe functions being tested are `calculate_stats` and `load_data` from `pandas_dataframe_code`. Since the error was a KeyError for 'Price' and the test involves a DataFrame, we should inspect `pandas_dataframe_code.py` to see how these functions handle DataFrame columns and where 'Price' is referenced.\n", - "\n", - "Let's view the first 30 lines of `pandas_dataframe_code.py`.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_YBDpxvk7tMeuHQ0hKRmjRCu9', name='view', arguments={'path': 'pandas_dataframe_code.py', 'start': 1, 'end': 30, 'include_line_numbers_and_breakpoints': True})\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_3yG6yynFgYSpzSCC6mypz5QD', name='view', arguments={'path': 'pandas_dataframe_code.py'})\u001b[0m\n", "\n", - "==================== STEP 4 ====================\n", + "==================== STEP 16 ====================\n", "Nb. 
of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mViewing `pandas_dataframe_code.py`, lines 1-17 of 17 total lines.\n", @@ -385,12 +690,12 @@ "```\n", "\n", "\u001b[0m\n", - "\u001b[36mIn `calculate_stats`, the code uses `df['Price']`. However, the Titanic dataset usually uses the column name 'Fare' for the ticket price, not 'Price'. This discrepancy likely causes the KeyError.\n", + "\u001b[36mThe root cause of the bug is in `pandas_dataframe_code.py` at line 15: the code tries to access the column `'Price'`, but the Titanic dataset typically uses the column name `'Fare'` for the fare paid. This results in a `KeyError: 'Price'`.\n", "\n", - "I will update line 15 in `pandas_dataframe_code.py` from `df['Price'].median()` to `df['Fare'].median()`.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_wz2JYVPKEG4fU92951S1zd3I', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 15, 'end': 15, 'new_code': \" median = df['Fare'].median()\"})\u001b[0m\n", + "To fix this, I will change `'Price'` to `'Fare'` in the `calculate_stats` function.\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_0XgmwOnKM1k7gC4JgZXNVgjV', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 15, 'end': 15, 'new_code': \" median = df['Fare'].median()\"})\u001b[0m\n", "\n", - "==================== STEP 5 ====================\n", + "==================== STEP 17 ====================\n", "Nb. 
of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mThe file `pandas_dataframe_code.py` has been updated successfully.\n", @@ -409,9 +714,9 @@ " return median\n", "\u001b[0m\n", "\u001b[36mNone\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_pt5g6Tki2qBI8XTCTekt7cYL', name='eval', arguments={})\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_rdERG3jtLItXMTSdVpw28eq0', name='eval', arguments={})\u001b[0m\n", "\n", - "==================== STEP 6 ====================\n", + "==================== STEP 18 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35m============================= test session starts ==============================\n", @@ -422,12 +727,10 @@ "=========================== short test summary info ============================\n", "FAILED test.py::test_calculate_stats - KeyError: 'Fare'\n", "============================== 1 failed in 0.41s ===============================\u001b[0m\n", - "\u001b[36mChanging 'Price' to 'Fare' did not resolve the KeyError, which means the loaded DataFrame does not have a 'Fare' column either. To diagnose further, let's inspect the actual columns of the DataFrame returned by `load_data()`. I will review a few lines from the Titanic CSV's initial rows to check the column names—this could be in the dataset URL or we can do it by looking at columns right after loading.\n", - "\n", - "Let's add a print statement to display the DataFrame columns right after reading the CSV in `load_data()`.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_BbhwlQs6exgbi3vpbLaUiGKP', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 10, 'end': 10, 'new_code': \" print('Loaded columns:', df.columns.tolist())\"})\u001b[0m\n", + "\u001b[36mEven after updating to 'Fare', a KeyError persists. This suggests the column name might be different or the dataset's schema isn't as expected. 
To debug further, I should inspect the columns of the downloaded DataFrame in `load_data()` to see the actual column names. I'll add a print statement to print the `df.columns` in the function.\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_l38oFO0cO4L0ABJ1bg184vEt', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 11, 'end': 11, 'new_code': ' print(df.columns)'})\u001b[0m\n", "\n", - "==================== STEP 7 ====================\n", + "==================== STEP 19 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mThe file `pandas_dataframe_code.py` has been updated successfully.\n", @@ -436,36 +739,36 @@ "\n", "--- original\n", "+++ current\n", - "@@ -7,7 +7,7 @@\n", - " \n", + "@@ -8,7 +8,7 @@\n", " # Read the CSV file\n", " df = pd.read_csv(url)\n", - "-\n", - "+ print('Loaded columns:', df.columns.tolist())\n", - " return df\n", + " \n", + "- return df\n", + "+ print(df.columns)\n", " \n", " def calculate_stats(df):\n", + " # Calculate the median fare paid\n", "\u001b[0m\n", "\u001b[36mNone\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_Hq3kEkfuEWkJgYCUQobKL4WR', name='eval', arguments={})\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_XWXnxjxJr7JpcEeQRzW3aF4a', name='eval', arguments={})\u001b[0m\n", "\n", - "==================== STEP 8 ====================\n", + "==================== STEP 20 ====================\n", "Nb. 
of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35m============================= test session starts ==============================\n", "collected 1 item\n", "\n", - "test.py Loaded columns: ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'prix', 'cabin', 'embarked', 'boat', 'body', 'home.dest']\n", + "test.py Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',\n", + " 'prix', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],\n", + " dtype='object')\n", "F\n", "\n", "=========================== short test summary info ============================\n", - "FAILED test.py::test_calculate_stats - KeyError: 'Fare'\n", - "============================== 1 failed in 0.40s ===============================\u001b[0m\n", - "\u001b[36mThe loaded columns include 'prix' but not 'Fare' or 'Price'. 'prix' is the French word for 'price', which implies this is the intended column for the fare/price.\n", - "\n", - "I will update `calculate_stats` to use `df['prix']` instead of `df['Fare']`.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_hzm5UnlMtf1q7Xiidqo5cdNI', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 15, 'end': 15, 'new_code': \" median = df['prix'].median()\"})\u001b[0m\n", - "Step: 8 | Score: 1/1 (100.0%) | Reason: bug fixed\n" + "FAILED test.py::test_calculate_stats - TypeError: 'NoneType' object is not su...\n", + "============================== 1 failed in 0.46s ===============================\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_ioQSMq9JNCWKaxVl90bcFbqz', name='view', arguments={'path': 'pandas_dataframe_code.py', 'start': 9, 'end': 13})\u001b[0m\n", + "Step: 20 | Score: 0/1 (0.0%) | Reason: max steps reached\n" ] } ], @@ -475,6 +778,9 @@ "\n", "from debug_gym.agents.history_tracker import HistoryTracker, build_history_prompt\n", "\n", + "# For the sake of this tutorial, we disable the logger to avoid cluttering 
the output.\n", + "logger.disabled = True\n", + "\n", "MAX_STEPS = 20 # Maximum number of steps to run the agent.\n", "MEMORY_SIZE = 20 # Size of the history tracker.\n", "TASK_NAME = \"pandas_dataframe\"\n", @@ -565,14 +871,21 @@ "- view(path, start, end, include_line_numbers_and_breakpoints): Specify a file path to view its content.\n", "- rewrite(path, start, end, new_code): Rewrite the content of the specified file path, between lines [start, end], with the new code.\n", "- eval(): Evaluate the current code against pre-defined test cases.\n", - "- pdb(command): An interface to the Python debugger PDB.\n", + "- pdb(command, entrypoint): An interface to the Python debugger PDB.\n", "\n", "System prompt:\n", - "\u001b[33mYour goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools, you can use them to investigate the code and propose a rewriting patch to fix the bugs. Avoid rewriting the entire code, focus on the bugs only. At every step, you have to use one of the tools via function calling. You can only call one tool at a time. Do not repeat your previous action unless they can provide more information. You can think step by step to help you make the decision at every step, but you must be concise and avoid overthinking. Output both your thinking process (if any) and the tool call in the response. Do not use rewrite for adding print statement, use the pdb tool instead.\u001b[0m\n", + "\u001b[33mYour goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools, you can use them to investigate the code and propose a rewriting patch to fix the bugs. Avoid rewriting the entire code, focus on the bugs only. At every step, you have to use one of the tools via function calling. You can only call one tool at a time. Do not repeat your previous action unless they can provide more information. 
You can think step by step to help you make the decision at every step, but you must be concise and avoid overthinking. Output both your thinking process (if any) and the tool call in the response. Do not use rewrite for adding print statement, use the pdb tool instead without specifying any entrypoint.\u001b[0m\n", "\n", "==================== STEP 1 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", + "\u001b[35mThe program doesn't behave as intended. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue. Beaware that the bug may not be in the code you initially see.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_dqZ8aRRXKOO2A9QddIVKdmfP', name='eval', arguments={})\u001b[0m\n", + "\n", + "==================== STEP 2 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", "\u001b[35m============================= test session starts ==============================\n", "collected 1 item\n", "\n", @@ -580,13 +893,11 @@ "\n", "=========================== short test summary info ============================\n", "FAILED test.py::test_calculate_stats - KeyError: 'Price'\n", - "============================== 1 failed in 0.41s ===============================\u001b[0m\n", - "\u001b[36mThe error message indicates there's a KeyError: 'Price' likely when accessing a dictionary (perhaps a DataFrame). 
Let's check the code in 'test.py' first to see the test and spot how 'Price' is accessed.\n", - "\n", - "I'll view the first 20 lines of 'test.py' to locate the test and any references to 'Price'.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_begvRVC6NlzNJZHmz0GGni0V', name='view', arguments={'path': 'test.py', 'start': 1, 'end': 20})\u001b[0m\n", + "============================== 1 failed in 0.40s ===============================\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_frWgHgSh1cBDBU86oDKrvY26', name='view', arguments={'path': 'test.py'})\u001b[0m\n", "\n", - "==================== STEP 2 ====================\n", + "==================== STEP 3 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mViewing `test.py`, lines 1-8 of 8 total lines. The file is read-only.\n", @@ -603,12 +914,10 @@ "```\n", "\n", "\u001b[0m\n", - "\u001b[36mThe file `test.py` calls `load_data()` and then passes its result (a DataFrame) to `calculate_stats(df)`. The KeyError: 'Price' must be coming from one of those two functions. \n", - "\n", - "To find out how 'Price' is being accessed, I should next inspect the implementations of `load_data` and `calculate_stats` in `pandas_dataframe_code.py`. I will view the first 20 lines of `pandas_dataframe_code.py` to locate these definitions and see how the 'Price' key is being used.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_4mPKpe43r2rLDNZ9Bnwt1P6N', name='view', arguments={'path': 'pandas_dataframe_code.py', 'start': 1, 'end': 20})\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_sJ87HhlUdlwV6FDzGAebeHYn', name='view', arguments={'path': 'pandas_dataframe_code.py'})\u001b[0m\n", "\n", - "==================== STEP 3 ====================\n", + "==================== STEP 4 ====================\n", "Nb. 
of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mViewing `pandas_dataframe_code.py`, lines 1-17 of 17 total lines.\n", @@ -634,14 +943,12 @@ "```\n", "\n", "\u001b[0m\n", - "\u001b[36mThe code reads the Titanic dataset from the provided URL and then tries to access the 'Price' column (`df['Price']`). However, in the Titanic dataset the column is typically named 'Fare', not 'Price'. \n", + "\u001b[36mThe test fails with a KeyError: 'Price'. Looking at the code, calculate_stats tries to access `df['Price']`, but the Titanic dataset's fare column is typically named 'Fare', not 'Price'. This is likely the root cause.\n", "\n", - "To fix the KeyError, I should change 'Price' to 'Fare' in the `calculate_stats` function.\n", + "I will fix line 15 in `pandas_dataframe_code.py` to use 'Fare' instead of 'Price'.\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_FRbHDtlR0Gnk9Qr7eDkvaVyq', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 15, 'end': 15, 'new_code': \" median = df['Fare'].median()\"})\u001b[0m\n", "\n", - "I'll make this correction now.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_1zHwHnjYc4KSb86IwmshtW7W', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 15, 'end': 15, 'new_code': \" median = df['Fare'].median()\"})\u001b[0m\n", - "\n", - "==================== STEP 4 ====================\n", + "==================== STEP 5 ====================\n", "Nb. 
of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mThe file `pandas_dataframe_code.py` has been updated successfully.\n", @@ -660,9 +967,9 @@ " return median\n", "\u001b[0m\n", "\u001b[36mNone\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_DmgnpmbteRLYEoSX59ZgUACQ', name='eval', arguments={})\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_2Nu3OYenwgMwqRZYFZDURJN4', name='eval', arguments={})\u001b[0m\n", "\n", - "==================== STEP 5 ====================\n", + "==================== STEP 6 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35m============================= test session starts ==============================\n", @@ -672,20 +979,28 @@ "\n", "=========================== short test summary info ============================\n", "FAILED test.py::test_calculate_stats - KeyError: 'Fare'\n", - "============================== 1 failed in 0.40s ===============================\u001b[0m\n", - "\u001b[36mIt seems the column is still not found—KeyError: 'Fare'—even after changing the key name. This hints that the actual column name might be something else, or perhaps there is a whitespace issue or different capitalization.\n", + "============================== 1 failed in 0.38s ===============================\u001b[0m\n", + "\u001b[36mChanging 'Price' to 'Fare' did not work—the KeyError is now for 'Fare'. This suggests the column in the dataset has a different name. 
I need to inspect the actual columns in the DataFrame returned by `load_data()`.\n", "\n", - "To verify the exact columns present in the DataFrame, I'll add a PDB breakpoint right before the 'Fare' access in `calculate_stats`, and inspect the column names.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_w0cswEtDKc8bSmupynXuY3YO', name='pdb', arguments={'command': 'b pandas_dataframe_code.py:15'})\u001b[0m\n", + "I'll add a pdb breakpoint inside `calculate_stats` before median calculation and print the dataframe columns.\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_vLRjkOZsdcsXfMY5AfXFKWwl', name='pdb', arguments={'command': 'b pandas_dataframe_code.py:14'})\u001b[0m\n", "\n", - "==================== STEP 6 ====================\n", + "==================== STEP 7 ====================\n", + "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", + "\n", + "\u001b[35mInvalid pdb command: b pandas_dataframe_code.py:14\n", + "Invalid line number: *** Blank or comment.\u001b[0m\n", + "\u001b[36mNone\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_v8pi9CEgcmZyD3c7xooSirPw', name='pdb', arguments={'command': 'b pandas_dataframe_code.py:15'})\u001b[0m\n", + "\n", + "==================== STEP 8 ====================\n", "Nb. 
of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mPdb command output:\n", - "Breakpoint 1 at /tmp/RepoEnv-pzzs7o6s/pandas_dataframe_code.py:15\n", + "Breakpoint 1 at /testbed/pandas_dataframe_code.py:15\n", "\n", "Current frame:\n", - "/home/macote/src/debug-gym/.venv/lib/python3.12/site-packages/pytest/__main__.py\n", + "/usr/local/lib/python3.12/site-packages/pytest/__main__.py\n", "\n", "Context around the current frame:\n", " 1 ->\t\"\"\"The pytest entry point.\"\"\"\n", @@ -700,23 +1015,22 @@ "[EOF]\n", "\u001b[0m\n", "\u001b[36mNone\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_tjblDGlTzY3QWdKEysPiCATc', name='pdb', arguments={'command': 'c'})\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_RyXDTMkYzJEf2CP0NL5tVa70', name='pdb', arguments={'command': 'c'})\u001b[0m\n", "\n", - "==================== STEP 7 ====================\n", + "==================== STEP 9 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mPdb command output:\n", "============================= test session starts ==============================\n", - "platform linux -- Python 3.12.3, pytest-8.4.1, pluggy-1.6.0\n", - "rootdir: /tmp/RepoEnv-pzzs7o6s\n", - "plugins: anyio-4.10.0\n", + "platform linux -- Python 3.12.12, pytest-8.4.2, pluggy-1.6.0\n", + "rootdir: /testbed\n", "collected 1 item \n", "\n", - "test.py > /tmp/RepoEnv-pzzs7o6s/pandas_dataframe_code.py(15)calculate_stats()\n", + "test.py > /testbed/pandas_dataframe_code.py(15)calculate_stats()\n", "-> median = df['Fare'].median()\n", "\n", "Current frame:\n", - "/tmp/RepoEnv-pzzs7o6s/pandas_dataframe_code.py\n", + "/testbed/pandas_dataframe_code.py\n", "\n", "Context around the current frame:\n", " 10 \t\n", @@ -730,9 +1044,9 @@ "[EOF]\n", "\u001b[0m\n", "\u001b[36mNone\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_SUK8QD60uW9Q78ORFZP5bB1d', name='pdb', arguments={'command': 'p df.columns'})\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: 
ToolCall(id='call_fQJNc1V8ENb1ETm7lOKszoGr', name='pdb', arguments={'command': 'p df.columns'})\u001b[0m\n", "\n", - "==================== STEP 8 ====================\n", + "==================== STEP 10 ====================\n", "Nb. of tests passed: 0/1 (0.0%) [Best: 0]\n", "\n", "\u001b[35mPdb command output:\n", @@ -741,7 +1055,7 @@ " dtype='object')\n", "\n", "Current frame:\n", - "/tmp/RepoEnv-pzzs7o6s/pandas_dataframe_code.py\n", + "/testbed/pandas_dataframe_code.py\n", "\n", "Context around the current frame:\n", " 10 \t\n", @@ -754,11 +1068,20 @@ " 17 \t return median\n", "[EOF]\n", "\u001b[0m\n", - "\u001b[36mThe actual column in the DataFrame that represents the fare is named 'prix', which is French for 'price'. The function should use 'prix' instead of 'Fare' or 'Price'.\n", + "\u001b[36mThe column corresponding to the fare is labeled as 'prix', not 'Fare' or 'Price'. To fix the error, I need to update the code to use 'prix' as the column name.\n", "\n", - "I will update the code to access 'prix' instead.\u001b[0m\n", - "\u001b[1m\u001b[36mTool call: ToolCall(id='call_hmXldtZWWtPuZW80ZH78k0cX', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 15, 'end': 15, 'new_code': \" median = df['prix'].median()\"})\u001b[0m\n", - "Step: 8 | Score: 1/1 (100.0%) | Reason: bug fixed\n" + "I will rewrite the code at line 15 in pandas_dataframe_code.py:\n", + "\n", + "Replace:\n", + "```python\n", + "median = df['Fare'].median()\n", + "```\n", + "with:\n", + "```python\n", + "median = df['prix'].median()\n", + "```\u001b[0m\n", + "\u001b[1m\u001b[36mTool call: ToolCall(id='call_cY0aeFA4CJgQ94ghUWQvrazG', name='rewrite', arguments={'path': 'pandas_dataframe_code.py', 'start': 15, 'end': 15, 'new_code': \" median = df['prix'].median()\"})\u001b[0m\n", + "Step: 10 | Score: 1/1 (100.0%) | Reason: bug fixed\n" ] } ], @@ -766,7 +1089,7 @@ "if not env.has_tool(\"pdb\"):\n", " env.add_tool(Toolbox.get_tool(\"pdb\"))\n", " SYSTEM_PROMPT += (\n", - " \" Do 
not use rewrite for adding print statement, use the pdb tool instead.\"\n", + " \" Do not use rewrite for adding print statement, use the pdb tool instead without specifying any entrypoint.\"\n", " )\n", "\n", "history = run()" @@ -786,7 +1109,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "88634798", "metadata": {}, "outputs": [], @@ -808,7 +1131,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "e715fb88", "metadata": {}, "outputs": [ @@ -821,14 +1144,23 @@ "\u001b[31m\u001b[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.\u001b[0m\n", " * Running on all addresses (0.0.0.0)\n", " * Running on http://127.0.0.1:5000\n", - " * Running on http://10.209.224.183:5000\n", + " * Running on http://10.0.11.199:5000\n", "\u001b[33mPress CTRL+C to quit\u001b[0m\n", - "^C\n" + "127.0.0.1 - - [06/Nov/2025 18:25:16] \"\u001b[32mGET / HTTP/1.1\u001b[0m\" 302 -\n", + "127.0.0.1 - - [06/Nov/2025 18:25:17] \"GET /upload HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:25:18] \"GET /browse_directory HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:25:22] \"GET /load_file_from_path?path=/workspaces/debug-gym/notebooks/tutorial.jsonl HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:25:23] \"GET / HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:25:24] \"GET /get_step/0 HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:25:42] \"GET /get_step/4 HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:26:02] \"GET /get_step/8 HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:26:04] \"GET /get_step/9 HTTP/1.1\" 200 -\n", + "127.0.0.1 - - [06/Nov/2025 18:26:12] \"GET /statistics HTTP/1.1\" 200 -\n" ] } ], "source": [ - "\n", + "# !pip install flask\n", "# Run the viewer.\n", "!python ../analysis/json_log_viewer/json_log_viewer.py" ] @@ -836,7 +1168,7 @@ ], "metadata": { "kernelspec": { - "display_name": "debug-gym 
(3.12.3)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -850,7 +1182,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index e60c649c..b81e53e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ pytest==8.3.5 gitignore_parser==0.1.12 termcolor==3.1.0 transformers==4.51.3 -tiktoken==0.9.0 +tiktoken docker==7.1.0 swebench==4.0.3 swesmith==0.0.4 diff --git a/scripts/config_free_env.yaml b/scripts/config_free_env.yaml new file mode 100644 index 00000000..2f9b06cc --- /dev/null +++ b/scripts/config_free_env.yaml @@ -0,0 +1,42 @@ +# Configuration for standalone FreeEnv + FreeAgent runs. +task_name: free-session + +llm: + name: "4o-az" + +# Tools to load into the environment toolbox. +tools: + - rewrite + - bash + - submit: + eval_on_submit: False # Here we only terminate after submission, no auto-eval. 
+ +environment: + image: jyangballin/swesmith.x86_64.amueller_1776_word_cloud.ec24191c:latest + workspace_dir: /testbed + terminal: + type: docker + # type: kubernetes + # registry: debuggymacr.azurecr.io + # namespace: mtl-cpu-jobs + # kube_config: ~/.kube/config + # # kube_context: null + # pod_spec_kwargs: + # tolerations: + # - key: node.kubernetes.io/disk-pressure + # operator: Exists + # effect: NoExecute + # tolerationSeconds: 10800 + # - key: kubernetes.azure.com/scalesetpriority + # operator: Equal + # value: spot + # effect: NoSchedule + # - key: CriticalAddonsOnly + # operator: Equal + # value: "true" + # effect: NoSchedule + +agent: + random_seed: 42 + max_steps: 20 + output_path: exps/free_env diff --git a/scripts/free_env_human.py b/scripts/free_env_human.py new file mode 100644 index 00000000..a82a9e6a --- /dev/null +++ b/scripts/free_env_human.py @@ -0,0 +1,200 @@ +"""Interactive FreeEnv demo that runs a container image with a human operator.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any, Iterable + +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.terminals import select_terminal +from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.tools.toolbox import Toolbox +from debug_gym.llms.human import Human +from debug_gym.logger import DebugGymLogger + +DEFAULT_IMAGE = "swesmith.x86_64.amueller__word_cloud.ec24191c" +DEFAULT_TOOLS = [ + "listdir", + "view", + "grep", + "rewrite", + "bash", + {"submit": {"eval_on_submit": False}}, +] + + +def format_observations(env_info) -> list[dict]: + messages = [ + { + "role": "system", + "content": env_info.instructions or "Interact with the repository.", + } + ] + + instructions_text = (env_info.instructions or "").strip() + for index, observation in enumerate(env_info.all_observations): + text = observation.observation.strip() + if index == 0 and text == instructions_text: + continue + prefix = 
f"[{observation.source}] " if observation.source else "" + messages.append({"role": "user", "content": f"{prefix}{text}"}) + return messages + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Launch a FreeEnv session with human-in-the-loop control.", + ) + parser.add_argument( + "--image", + default=DEFAULT_IMAGE, + help="Docker image name to load inside the environment.", + ) + parser.add_argument( + "--terminal", + default="docker", + choices=["docker", "kubernetes"], + help="Terminal backend to use.", + ) + parser.add_argument( + "--registry", + default=None, + help="Optional registry prefix (e.g. ghcr.io/swe-bench).", + ) + parser.add_argument( + "--workspace-dir", + default="/testbed", + help="Working directory inside the container or pod.", + ) + parser.add_argument( + "--mount-path", + type=Path, + default=None, + help="Optional host path whose contents should be copied into the environment.", + ) + parser.add_argument( + "--setup-command", + action="append", + default=[], + help="Additional setup commands to run when the terminal starts (repeatable).", + ) + parser.add_argument( + "--tool", + dest="tools", + action="append", + default=None, + help="Tool name to add to the toolbox (can be specified multiple times).", + ) + parser.add_argument( + "--init-git", + action="store_true", + help="Initialize a git repository inside the environment (disabled by default).", + ) + parser.add_argument( + "--instructions", + default=None, + help="Custom instruction text displayed at reset.", + ) + parser.add_argument( + "--max-retries", + type=int, + default=10, + help="Maximum number of retries for invalid human tool calls.", + ) + parser.add_argument( + "--dir-tree-depth", + type=int, + default=2, + help="Depth of the directory tree shown in observations.", + ) + return parser + + +def _add_tools(env: FreeEnv, tool_specs: Iterable[Any], logger: DebugGymLogger) -> None: + """Attach toolbox entries, defaulting submit to 
eval_on_submit=False for humans.""" + + for spec in tool_specs: + tool_kwargs: dict[str, Any] = {} + if isinstance(spec, dict): + if len(spec) != 1: + raise ValueError("Tool dictionary must contain exactly one entry") + spec = dict(spec) + tool_name, tool_kwargs = next(iter(spec.items())) + else: + tool_name = str(spec) + + if tool_name == "submit" and "eval_on_submit" not in tool_kwargs: + tool_kwargs = {**tool_kwargs, "eval_on_submit": False} + + env.add_tool(Toolbox.get_tool(tool_name, **tool_kwargs)) + logger.debug("Loaded tool %s with options %s", tool_name, tool_kwargs) + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + logger = DebugGymLogger("free-env-demo") + + tool_specs: list[Any] + if args.tools: + # User-specified tools override defaults but still respect submit behaviour. + tool_specs = list(args.tools) + else: + tool_specs = list(DEFAULT_TOOLS) + + terminal_config: dict[str, Any] = { + "type": args.terminal, + "base_image": args.image, + "working_dir": args.workspace_dir, + } + if args.setup_command: + terminal_config["setup_commands"] = list(args.setup_command) + if args.registry: + terminal_config["registry"] = args.registry + + terminal: Terminal | None = select_terminal(terminal_config, logger=logger) + + env = FreeEnv( + image=args.image, + terminal=terminal, + mount_path=args.mount_path, + setup_commands=args.setup_command, + instructions=args.instructions, + init_git=args.init_git, + workspace_dir=args.workspace_dir, + logger=logger, + dir_tree_depth=args.dir_tree_depth, + ) + + _add_tools(env, tool_specs, logger) + logger.info("Loaded tools: %s", env.tool_names) + + info = env.reset() + human = Human(logger=logger, max_retries=args.max_retries) + + try: + while True: + messages = format_observations(info) + response = human(messages, env.tools) + logger.info( + "Running %s with arguments %s", + response.tool.name, + response.tool.arguments, + ) + info = env.step( + 
response.tool, + action_content=response.response, + ) + except KeyboardInterrupt: + logger.info("Session interrupted by user.") + except ValueError as exc: + logger.error("Session terminated: %s", exc) + finally: + env.close() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_free_env.py b/scripts/run_free_env.py new file mode 100644 index 00000000..13d7b367 --- /dev/null +++ b/scripts/run_free_env.py @@ -0,0 +1,163 @@ +"""Standalone runner for FreeEnv + FreeAgent with human-visible logging.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any, Mapping + +from debug_gym.agents.free_agent import FreeAgent +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.terminals import select_terminal +from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.tools.toolbox import Toolbox +from debug_gym.llms.base import LLM +from debug_gym.llms.human import Human +from debug_gym.logger import DebugGymLogger + + +def build_parser() -> argparse.ArgumentParser: + """Create the CLI parser that exposes the runner configuration flag.""" + parser = argparse.ArgumentParser(description="Run FreeAgent against FreeEnv.") + parser.add_argument( + "--config", + type=Path, + default=Path("scripts/config_free_env.yaml"), + help="Path to the YAML configuration file.", + ) + return parser + + +def load_app_config(path: Path) -> dict: + """Load the YAML configuration used to seed the environment and agent.""" + import yaml + + with open(path, "r", encoding="utf-8") as handle: + return yaml.safe_load(handle) + + +def build_llm(config: dict, logger: DebugGymLogger): + """Instantiate the LLM (or human driver) based on configuration defaults.""" + llm_cfg = config.get("llm") or {} + llm_name = llm_cfg.get("name") or config.get("llm_name") or "human" + + if llm_name.lower() == "human": + return Human(model_name="human", logger=logger) + + return LLM.instantiate( + 
llm_name=llm_name, + llm_config_file_path=llm_cfg.get("config_file") + or config.get("llm_config_file_path"), + logger=logger, + ) + + +def resolve_terminal( + env_config: Mapping[str, Any], + logger: DebugGymLogger, +) -> Terminal | None: + """Resolve the requested terminal backend, normalizing legacy config shapes.""" + terminal_setting = env_config.get("terminal") + + if isinstance(terminal_setting, Terminal): + return terminal_setting + + if terminal_setting is None: + terminal_config: dict[str, Any] = {"type": "docker"} + elif isinstance(terminal_setting, str): + terminal_config = {"type": terminal_setting} + elif isinstance(terminal_setting, Mapping): + terminal_config = dict(terminal_setting) + else: + raise TypeError( + "terminal configuration must be a mapping, string, Terminal, or None", + ) + + terminal_config.setdefault("type", "docker") + terminal_config["type"] = str(terminal_config["type"]).lower() + terminal_config.setdefault("base_image", env_config["image"]) + terminal_config.setdefault( + "working_dir", env_config.get("workspace_dir", "/testbed") + ) + + setup_commands = env_config.get("setup_commands") + if setup_commands: + terminal_config.setdefault("setup_commands", list(setup_commands)) + + overrides = dict(env_config.get("terminal_kwargs") or {}) + terminal_config.update(overrides) + + return select_terminal(terminal_config, logger=logger) + + +def add_tools(env: FreeEnv, tools_config: list[Any], logger: DebugGymLogger) -> None: + """Instantiate tools defined in config, honoring optional per-tool kwargs.""" + + for tool_entry in tools_config: + tool_kwargs: dict[str, Any] = {} + if isinstance(tool_entry, Mapping): + if len(tool_entry) != 1: + raise ValueError("Tool mapping entries must contain a single tool name") + tool_entry = dict(tool_entry) + tool_name, tool_kwargs = next(iter(tool_entry.items())) + else: + tool_name = str(tool_entry) + + if tool_name == "submit" and "eval_on_submit" not in tool_kwargs: + tool_kwargs = {**tool_kwargs, 
"eval_on_submit": False} + + env.add_tool(Toolbox.get_tool(tool_name, **tool_kwargs)) + logger.debug("Added tool %s with options %s", tool_name, tool_kwargs) + + +def main() -> int: + """Entrypoint for running FreeAgent against FreeEnv from the command line.""" + args = build_parser().parse_args() + config = load_app_config(args.config) + + logger = DebugGymLogger("free-agent-run") + + env_cfg = config["environment"] + terminal = resolve_terminal(env_cfg, logger) + # Copy only the knobs understood by FreeEnv, leaving unrelated config behind. + env_kwargs = dict( + image=env_cfg["image"], + terminal=terminal, + mount_path=env_cfg.get("mount_path"), + setup_commands=env_cfg.get("setup_commands"), + instructions=env_cfg.get("instructions"), + init_git=env_cfg.get("init_git", True), + workspace_dir=env_cfg.get("workspace_dir", "/testbed"), + logger=logger, + dir_tree_depth=env_cfg.get("dir_tree_depth", 2), + ) + + # Instantiate the environment once the terminal and core parameters are ready. + env = FreeEnv(**env_kwargs) + + tools_config = config.get("tools") + if not tools_config: + raise ValueError( + "Configuration must specify a non-empty 'tools' list for FreeEnv sessions." + ) + + add_tools(env, tools_config, logger) + + llm = build_llm(config, logger) + agent_config = config.get("agent", {}) + agent = FreeAgent(config=agent_config, env=env, llm=llm, logger=logger) + + task_name = config.get("task_name", "free-session") + + try: + resolved = agent.run(task_name=task_name) + agent.save_trajectory(task_name=task_name) + agent.save_patch(task_name=task_name) + logger.info(f"Run complete. 
Resolved={resolved}") + return 0 + finally: + env.close() + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/agents/conftest.py b/tests/agents/conftest.py index 6d9375ee..503faf2c 100644 --- a/tests/agents/conftest.py +++ b/tests/agents/conftest.py @@ -27,7 +27,7 @@ def agent_setup(tmp_path, open_data): def _length(text): return len(text) - def _agent_setup(agent_class): + def _agent_setup(agent_class, *, config_override=None): with ( patch("tiktoken.encoding_for_model") as mock_encoding_for_model, patch("os.path.exists", return_value=True), @@ -48,6 +48,8 @@ def _agent_setup(agent_class): "output_path": str(tmp_path), "random_seed": 42, } + if config_override: + config_dict.update(config_override) env = MagicMock() llm = MagicMock() llm.reasoning_end_token = None diff --git a/tests/agents/test_free_agent.py b/tests/agents/test_free_agent.py new file mode 100644 index 00000000..0bb9ffd2 --- /dev/null +++ b/tests/agents/test_free_agent.py @@ -0,0 +1,62 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from debug_gym.agents.base_agent import BaseAgent +from debug_gym.agents.free_agent import FreeAgent + + +@pytest.fixture +def make_free_agent(agent_setup): + def _factory(*, config_override=None): + agent, env, llm = next(agent_setup(FreeAgent, config_override=config_override)) + agent.logger = MagicMock() + return agent, env, llm + + return _factory + + +def test_free_agent_run_delegates_to_base(make_free_agent): + agent, _, _ = make_free_agent() + + with patch.object(BaseAgent, "run", return_value=True) as mock_run: + result = agent.run(task_name="demo", debug=True) + + mock_run.assert_called_once_with(task_name="demo", debug=True) + assert result is True + + +def test_free_agent_reraises_root_cause_for_missing_reset(make_free_agent): + agent, _, _ = make_free_agent() + + def side_effect(*args, **kwargs): + try: + raise RuntimeError("reset failed") + except RuntimeError as exc: # pragma: no cover - exercised below + raise 
AttributeError( + "'NoneType' object has no attribute 'max_score'" + ) from exc + + with patch.object(BaseAgent, "run", side_effect=side_effect): + with pytest.raises(RuntimeError) as excinfo: + agent.run(task_name="demo") + + assert str(excinfo.value) == "reset failed" + agent.logger.error.assert_called_once() + + +def test_free_agent_bubbles_unrelated_attribute_error(make_free_agent): + agent, _, _ = make_free_agent() + + with patch.object(BaseAgent, "run", side_effect=AttributeError("other")): + with pytest.raises(AttributeError, match="other"): + agent.run(task_name="demo") + + agent.logger.error.assert_not_called() + + +def test_free_agent_system_prompt_override(make_free_agent): + custom_prompt = "Inspect quietly." + agent, _, _ = make_free_agent(config_override={"system_prompt": custom_prompt}) + + assert agent.system_prompt == custom_prompt diff --git a/tests/gym/envs/test_free_env.py b/tests/gym/envs/test_free_env.py new file mode 100644 index 00000000..99fd3ecd --- /dev/null +++ b/tests/gym/envs/test_free_env.py @@ -0,0 +1,133 @@ +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock + +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.terminal import Terminal + + +class DummyTerminal(Terminal): + """Test helper terminal with minimal behavior for FreeEnv interactions.""" + + def __init__( + self, + *, + working_dir: str = "/tmp/test", + logger: Any | None = None, + base_image: str | None = None, + setup_commands: list[str] | None = None, + ): + super().__init__(working_dir=working_dir, logger=logger) + self.base_image = base_image + self.setup_commands = list(setup_commands or []) + self.closed = False + + def prepare_command(self, entrypoint): + return ["/bin/true"] + + def run(self, entrypoint, timeout=None, raises=False, strip_output=True): + if isinstance(entrypoint, str) and "tree" in entrypoint: + return True, "/workspace\n" + return 
True, "" + + @property + def default_shell_command(self): + return "/bin/true" + + def new_shell_session(self): + return None + + def copy_content(self, src, target=None): + return None + + def close(self): + self.closed = True + + +def test_free_env_defaults_to_local_terminal(): + logger = MagicMock() + + env = FreeEnv(image="ubuntu:22.04", logger=logger) + + assert isinstance(env.terminal, LocalTerminal) + assert env.container_image == "ubuntu:22.04" + + +def test_free_env_configures_existing_terminal(): + logger = MagicMock() + terminal_logger = MagicMock() + terminal = DummyTerminal( + working_dir="/initial", + logger=terminal_logger, + base_image="base", + setup_commands=["existing"], + ) + + env = FreeEnv( + image="ubuntu:22.04", + terminal=terminal, + setup_commands=["apt update"], + workspace_dir="/workspace", + logger=logger, + init_git=False, + ) + + env.reset() + + assert env.terminal is terminal + assert terminal.base_image == "ubuntu:22.04" + assert terminal.working_dir == "/workspace" + assert terminal.logger is logger + assert terminal.setup_commands == ["apt update"] + + +def test_free_env_respects_custom_workspace(tmp_path): + logger = MagicMock() + terminal = DummyTerminal(logger=logger) + + env = FreeEnv( + image="ubuntu:22.04", + terminal=terminal, + workspace_dir="/workspace", + logger=logger, + init_git=False, + ) + + env.reset() + + assert env.workspace.working_dir == Path("/workspace") + assert terminal.working_dir == "/workspace" + + +def test_free_env_reset_allows_dynamic_overrides(): + logger = MagicMock() + terminal = DummyTerminal(logger=logger, setup_commands=["initial"]) + + env = FreeEnv( + image="ubuntu:22.04", + terminal=terminal, + setup_commands=["initial"], + workspace_dir="/workspace", + logger=logger, + init_git=True, + ) + + env.reset( + options={ + "image": "ubuntu:24.04", + "workspace_dir": "/new", + "setup_commands": ["echo ready"], + "instructions": "Inspect carefully.", + "init_git": False, + } + ) + + assert 
env.container_image == "ubuntu:24.04" + assert env.instructions == "Inspect carefully." + assert env.init_git is False + assert env._workspace_dir == "/new" + assert terminal.working_dir == "/new" + assert terminal.setup_commands == ["echo ready"] + assert terminal.base_image == "ubuntu:24.04" + assert terminal.closed is True diff --git a/tests/gym/terminals/test_terminal.py b/tests/gym/terminals/test_terminal.py index 3867ea18..eb2ac54c 100644 --- a/tests/gym/terminals/test_terminal.py +++ b/tests/gym/terminals/test_terminal.py @@ -154,3 +154,37 @@ def test_select_terminal_unknown(): def test_select_terminal_invalid_config(): with pytest.raises(TypeError): select_terminal("not a dict") + + +def test_select_terminal_kubernetes_extra_labels(monkeypatch): + captured = {} + + class DummyK8s: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr( + "debug_gym.gym.terminals.KubernetesTerminal", + DummyK8s, + ) + + config = { + "type": "kubernetes", + "namespace": "example", + "extra_labels": {"foo": "bar"}, + "pod_spec_kwargs": {"tolerations": []}, + } + + terminal = select_terminal(config, uuid="1234") + + assert isinstance(terminal, DummyK8s) + assert captured["namespace"] == "example" + assert captured["pod_spec_kwargs"] == {"tolerations": []} + assert captured["extra_labels"] == {"foo": "bar", "uuid": "1234"} + assert "logger" in captured + assert config == { + "type": "kubernetes", + "namespace": "example", + "extra_labels": {"foo": "bar"}, + "pod_spec_kwargs": {"tolerations": []}, + } From b8d6092a4df6956e0776ba27b285e6f6148b2dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Tue, 9 Dec 2025 13:03:55 -0800 Subject: [PATCH 02/12] Simplifying FreeEnv --- debug_gym/agents/__init__.py | 2 - debug_gym/agents/froggy_agent.py | 7 +- debug_gym/agents/utils.py | 45 +++---- debug_gym/experiment.py | 17 ++- debug_gym/gym/envs/__init__.py | 2 +- debug_gym/gym/envs/free_env.py | 214 +++++++++---------------------- 
debug_gym/gym/workspace.py | 2 +- scripts/config_free_env.yaml | 37 ++---- scripts/run.py | 9 +- tests/test_experiment.py | 87 ++++++++----- 10 files changed, 172 insertions(+), 250 deletions(-) diff --git a/debug_gym/agents/__init__.py b/debug_gym/agents/__init__.py index c955af01..3c51e0a4 100644 --- a/debug_gym/agents/__init__.py +++ b/debug_gym/agents/__init__.py @@ -1,5 +1,4 @@ from debug_gym.agents.base_agent import BaseAgent, register_agent -from debug_gym.agents.free_agent import FreeAgent from debug_gym.agents.froggy_agent import FroggyAgent from debug_gym.agents.solution_agent import AgentSolution from debug_gym.agents.swe_agent import SWEAgent @@ -7,7 +6,6 @@ __all__ = [ "BaseAgent", "register_agent", - "FreeAgent", "FroggyAgent", "AgentSolution", "SWEAgent", diff --git a/debug_gym/agents/froggy_agent.py b/debug_gym/agents/froggy_agent.py index 22a1da0e..e2a5a8d3 100644 --- a/debug_gym/agents/froggy_agent.py +++ b/debug_gym/agents/froggy_agent.py @@ -11,7 +11,7 @@ @dataclass class FroggyAgentArgs(AgentArgs): - max_rewrite_steps: int = -1 + max_rewrite_steps: int | None = None show_directory_tree: int = 0 show_current_breakpoints: bool = False @@ -24,7 +24,10 @@ class FroggyAgent(BaseAgent): def should_stop(self, step: int, info: EnvInfo): should_stop, reason = super().should_stop(step, info) - if info.rewrite_counter > self.args.max_rewrite_steps: + if ( + self.args.max_rewrite_steps is not None + and info.rewrite_counter > self.args.max_rewrite_steps + ): should_stop = True reason = "max_rewrite_steps reached" return should_stop, reason diff --git a/debug_gym/agents/utils.py b/debug_gym/agents/utils.py index 8b3771f5..80df0e8f 100644 --- a/debug_gym/agents/utils.py +++ b/debug_gym/agents/utils.py @@ -6,15 +6,14 @@ import yaml +from debug_gym.agents.base_agent import AGENT_REGISTRY from debug_gym.logger import DebugGymLogger def load_config(): parser = argparse.ArgumentParser() - parser.add_argument("config_file", help="path to config file") - 
parser.add_argument( - "--agent", - ) + parser.add_argument("--config", help="path to config file") + parser.add_argument("--agent", help="agent to use") parser.add_argument( "--debug", action="store_true", @@ -104,34 +103,22 @@ def load_config(): " e.g. -p 'rewrite_only.random_seed=123'", ) args = parser.parse_args() - assert os.path.exists(args.config_file), "Invalid config file" - with open(args.config_file) as reader: - config = yaml.safe_load(reader) - - available_agents = [item for item in list(config.keys()) if item != "base"] - - if not args.agent: - # pick first agent - args.agent = available_agents[0] - elif args.agent not in available_agents: + config = {} + if args.config is not None: + assert os.path.exists(args.config), "Invalid config file" + with open(args.config) as reader: + config = yaml.safe_load(reader) + + args.agent = args.agent or "froggy" + if args.agent not in AGENT_REGISTRY: raise ValueError( - f"Invalid agent: {args.agent}. Available agents: {available_agents}" + f"Invalid agent: {args.agent}. Available agents: {sorted(AGENT_REGISTRY)}" ) - if "base" in config: - # base config is specified (shared across agents) - return_config = config["base"] - # Override base config with agent specific config - for key, value in config[args.agent].items(): - return_config[key] = value - else: - # base config is not specified - return_config = config[args.agent] - # Parse overriden params. 
for param in args.params: fqn_key, value = param.split("=") - entry_to_change = return_config + entry_to_change = config keys = fqn_key.split(".") for k in keys[:-1]: if k not in entry_to_change: @@ -141,10 +128,10 @@ def load_config(): entry_to_change[keys[-1]] = yaml.safe_load(value) # assume agent type is the key if not specified by the user - if not return_config.get("agent_type"): - return_config["agent_type"] = args.agent + if not config.get("agent_type"): + config["agent_type"] = args.agent - return return_config, args + return config, args def save_patch(env, problem_path: Path, logger: DebugGymLogger): diff --git a/debug_gym/experiment.py b/debug_gym/experiment.py index d2aae2bc..35fce00e 100644 --- a/debug_gym/experiment.py +++ b/debug_gym/experiment.py @@ -11,14 +11,14 @@ from debug_gym.logger import DebugGymLogger -def create_env(config: dict, logger: DebugGymLogger): - terminal = select_terminal(config.get("terminal"), logger) - env_class = select_env(config.get("benchmark")) +def create_env(config: dict, task_data: dict, logger: DebugGymLogger): + terminal = select_terminal(config.get("terminal"), logger, uuid=config["uuid"]) + env_class = select_env(task_data["env_type"]) env = env_class( - **config["env_kwargs"], - problems=config.get("problems", ["custom"]), + task_data=task_data, terminal=terminal, logger=logger, + **config.get("env", {}), ) return env @@ -26,7 +26,12 @@ def create_env(config: dict, logger: DebugGymLogger): def add_tools(env, config: dict, logger: DebugGymLogger): """Add tools to the environment""" for tool in config["tools"]: - tool_instantiated = Toolbox.get_tool(tool) + tool_config = {} + if isinstance(tool, dict): + assert len(tool) == 1, "Tool dict must have exactly one key" + tool, tool_config = list(tool.items())[0] + + tool_instantiated = Toolbox.get_tool(tool, **tool_config) env.add_tool(tool_instantiated) logger.debug(f"Adding tool to toolbox: {tool_instantiated.__class__.__name__}") diff --git 
a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py index 8327e5bc..3743db9d 100644 --- a/debug_gym/gym/envs/__init__.py +++ b/debug_gym/gym/envs/__init__.py @@ -41,7 +41,7 @@ def select_env(env_type: str = None) -> type[RepoEnv]: return MiniNightmareEnv case "r2egym": return R2EGymEnv - case "free": + case "FreeEnv": return FreeEnv case _: raise ValueError(f"Unknown environment {env_type}") diff --git a/debug_gym/gym/envs/free_env.py b/debug_gym/gym/envs/free_env.py index 1942f249..0c1ffade 100644 --- a/debug_gym/gym/envs/free_env.py +++ b/debug_gym/gym/envs/free_env.py @@ -1,142 +1,96 @@ from __future__ import annotations -import shlex from pathlib import Path from typing import Any from debug_gym.gym.envs.env import RepoEnv -from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.docker import DockerTerminal +from debug_gym.gym.terminals.kubernetes import KubernetesTerminal from debug_gym.gym.terminals.terminal import Terminal -from debug_gym.logger import DebugGymLogger class FreeEnv(RepoEnv): - """Lightweight RepoEnv wrapper for running arbitrary container images.""" - - DEFAULT_TASK_NAME = "free-session" + """Repo environment that allows an agent to freely explore a codebase.""" def __init__( self, - image: str, + task_data: dict | None = None, *, + image: str | None = None, terminal: Terminal | None = None, - mount_path: str | Path | None = None, - setup_commands: list[str] | None = None, - instructions: str | None = None, - init_git: bool = True, + local_path: str | Path | None = None, workspace_dir: str | Path = "/testbed", - logger: DebugGymLogger | None = None, - **env_kwargs: Any, + setup_commands: list[str] | None = None, + **kwargs: Any, ) -> None: - """Create a free-form environment backed by an existing repository terminal.""" - self.container_image = image - self._custom_instructions = (instructions or "").strip() - self.init_git = init_git - self._setup_commands = list(setup_commands or []) - 
self._workspace_dir = str(workspace_dir) - - shared_logger = logger or DebugGymLogger("debug-gym") + terminal = terminal or DockerTerminal(logger=kwargs.get("logger")) + if not isinstance(terminal, (DockerTerminal, KubernetesTerminal)): + raise ValueError( + f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal." + ) + task_data = task_data or { + "image": image, + "local_path": local_path, + "workspace_dir": workspace_dir, + "setup_commands": setup_commands or [], # TODO: use setup_commands + } super().__init__( - path=str(mount_path) if mount_path is not None else None, - entrypoint="true", - debug_entrypoint="true", - max_score=0, + task_data=task_data, terminal=terminal, - logger=shared_logger, - **env_kwargs, + **kwargs, ) - if self.terminal is not None: - self._apply_terminal_settings() - - def _apply_terminal_settings(self) -> None: - """Keep terminal metadata (image/setup commands) in sync with env state.""" - terminal = self.terminal - if terminal is None: - return - if hasattr(terminal, "base_image"): - setattr(terminal, "base_image", self.container_image) - - if hasattr(terminal, "setup_commands"): - terminal.setup_commands = list(self._setup_commands) - - if hasattr(terminal, "working_dir") and not isinstance(terminal, LocalTerminal): - try: - terminal.working_dir = self._workspace_dir - except ValueError: - self.logger.debug( - "Terminal already active; keeping working_dir=%s", - getattr(terminal, "working_dir", self._workspace_dir), - ) - - if hasattr(terminal, "task_name"): - try: - terminal.task_name = self.DEFAULT_TASK_NAME - except ValueError: - self.logger.debug( - "Terminal already active; keeping existing task name." 
- ) - - terminal.logger = self.logger - - def load_dataset(self, problems: str | list[str] | None = None): - """Expose a single synthetic task keyed by DEFAULT_TASK_NAME.""" - return {self.DEFAULT_TASK_NAME: {"image": self.container_image}} - - def setup_task(self, task_name: str | None, options: dict | None = None) -> None: - """Record base image metadata for consistency with RepoEnv expectations.""" - self.task_name = task_name or self.DEFAULT_TASK_NAME - self.base_image = self.container_image - if hasattr(self.terminal, "base_image"): - setattr(self.terminal, "base_image", self.base_image) - - def setup_workspace(self) -> None: - """Ensure the remote workspace matches the configured working directory.""" - if isinstance(self.terminal, LocalTerminal): - super().setup_workspace() - return + @property + def task_name(self): + return f"FreeEnv({self.task_data['image']})" + def setup_task(self) -> None: + self.terminal.task_name = self.task_name + self.terminal.base_image = self.task_data["image"] + + def setup_workspace(self): + self.workspace.working_dir = self.task_data["workspace_dir"] self.workspace.reset() - self.workspace.working_dir = Path(self._workspace_dir) - if self.terminal is not None: - current_dir = getattr(self.terminal, "working_dir", None) - if current_dir != self._workspace_dir: - try: - self.terminal.working_dir = self._workspace_dir - except ValueError: - self.logger.debug( - "Terminal already active; keeping working_dir=%s", current_dir - ) - # Ensure core utilities exist before RepoEnv renders directory listings. - self.terminal.run( - "apt-get update -y && apt-get install -y tree", raises=True + + if self.task_data["local_path"]: + self.logger.info( + f"Copying content from {self.task_data['local_path']} to {self.workspace.working_dir}." 
) - self.terminal.run( - f"mkdir -p {shlex.quote(self._workspace_dir)}", - raises=True, + self.workspace.copy_content( + src=self.task_data["local_path"], target=self.workspace.working_dir ) - if self.path: - self.workspace.copy_content(self.path) - - self.workspace.setup_file_filters() + self.workspace.setup_file_filters() # Use codebase's .debugignore and .debugreadonly. def setup_terminal(self) -> None: """Apply FreeEnv tweaks and reuse RepoEnv git bootstrapping when enabled.""" - self._apply_terminal_settings() - if self.terminal is not None: - self.terminal.run("touch .debugignore .debugreadonly") + self.logger.info(f"Configuring {self.terminal}...") - if not self.init_git: - return - if not self._git_available(): - self.logger.debug( - "Git is not available in the container; skipping repository setup.", - ) - return - super().setup_terminal() + # Ensure core utilities exist before RepoEnv renders directory listings. + self.terminal.run( + "apt-get update -y && apt-get install -y git tree", raises=True + ) + # self.terminal.run( + # f"mkdir -p {shlex.quote(self._workspace_dir)}", + # raises=True, + # ) + + if self._git_available(): + self.terminal.run("git init") + self.terminal.run("git config user.name 'debug-gym'") + self.terminal.run("git config user.email '<>'") + + # self.terminal.run( + # "git add *.py *.txt" + # ) # Aider tasks only have Python and text files. + # self.terminal.run("git commit -am 'Init'") + + # self.terminal.run( + # "git add .debugignore .debugreadonly" + # ) # Aider tasks come with those. 
+ # self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") def _git_available(self) -> bool: """Check for git presence before attempting repository initialization.""" @@ -149,48 +103,6 @@ def _git_available(self) -> bool: def instructions(self) -> str: """Provide user-facing guidance, falling back to a generic sandbox blurb.""" return ( - self._custom_instructions - or "You are placed in an isolated Linux environment, use the available tools to interact with the environment effectively." + "You are placed in an isolated Linux environment," + "use the available tools to interact with the environment effectively." ) - - def reset(self, *, options: dict | None = None): - """Allow callers to mutate container settings before delegating to RepoEnv.""" - options = options or {} - - image = options.get("image") - workspace_dir = options.get("workspace_dir") - setup_commands = options.get("setup_commands") - instructions = options.get("instructions") - init_git = options.get("init_git") - - restart_terminal = False - - if image and image != self.container_image: - self.container_image = image - restart_terminal = True - - if workspace_dir and str(workspace_dir) != self._workspace_dir: - self._workspace_dir = str(workspace_dir) - restart_terminal = True - - if setup_commands is not None: - new_commands = list(setup_commands) - if new_commands != self._setup_commands: - self._setup_commands = new_commands - restart_terminal = True - - if instructions is not None: - self._custom_instructions = instructions - - if init_git is not None: - self.init_git = bool(init_git) - - if restart_terminal and self.terminal is not None: - try: - self.terminal.close() - except Exception as exc: # noqa: BLE001 - diagnostics only - self.logger.debug("Failed to close terminal cleanly: %s", exc) - - self._apply_terminal_settings() - - return super().reset(options=options) diff --git a/debug_gym/gym/workspace.py b/debug_gym/gym/workspace.py index 48ac9401..ba22b895 100644 
--- a/debug_gym/gym/workspace.py +++ b/debug_gym/gym/workspace.py @@ -38,7 +38,7 @@ def reset( ): self.cleanup() - self.working_dir = Path("/testbed") + self.working_dir = self.working_dir or Path("/testbed") # only create temp dir for local terminal if type(self.terminal) is LocalTerminal: self._tempdir = tempfile.TemporaryDirectory(prefix="DebugGym-") diff --git a/scripts/config_free_env.yaml b/scripts/config_free_env.yaml index 2f9b06cc..888627a8 100644 --- a/scripts/config_free_env.yaml +++ b/scripts/config_free_env.yaml @@ -1,8 +1,8 @@ # Configuration for standalone FreeEnv + FreeAgent runs. task_name: free-session +output_path: exps/free_env -llm: - name: "4o-az" +llm_name: "human" # Tools to load into the environment toolbox. tools: @@ -11,32 +11,15 @@ tools: - submit: eval_on_submit: False # Here we only terminate after submission, no auto-eval. -environment: - image: jyangballin/swesmith.x86_64.amueller_1776_word_cloud.ec24191c:latest +task_data: + env_type: FreeEnv + image: ubuntu:22.04 + local_path: /home/macote/src/debug-gym/data/mini_nightmare/pandas_dataframe workspace_dir: /testbed - terminal: - type: docker - # type: kubernetes - # registry: debuggymacr.azurecr.io - # namespace: mtl-cpu-jobs - # kube_config: ~/.kube/config - # # kube_context: null - # pod_spec_kwargs: - # tolerations: - # - key: node.kubernetes.io/disk-pressure - # operator: Exists - # effect: NoExecute - # tolerationSeconds: 10800 - # - key: kubernetes.azure.com/scalesetpriority - # operator: Equal - # value: spot - # effect: NoSchedule - # - key: CriticalAddonsOnly - # operator: Equal - # value: "true" - # effect: NoSchedule + +terminal: + type: docker agent: - random_seed: 42 + type: froggy max_steps: 20 - output_path: exps/free_env diff --git a/scripts/run.py b/scripts/run.py index 97d51b69..2dfe66c6 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -6,8 +6,9 @@ from pathlib import Path from debug_gym.agents.base_agent import AGENT_REGISTRY, create_agent -from 
debug_gym.agents.utils import load_config +from debug_gym.agents.utils import load_config, save_patch, save_trajectory from debug_gym.experiment import add_tools, create_env, dump_experiment_info +from debug_gym.gym.envs import load_dataset from debug_gym.llms.base import LLM from debug_gym.llms.human import Human from debug_gym.logger import DebugGymLogger, load_previous_run_status @@ -179,7 +180,11 @@ def main(): dump_experiment_info(config, args) # Load the dataset based on the information found in the config. - dataset = load_dataset(config["dataset"], logger=logger) + if config.get("task_data") is not None: + dataset = {f"custom-task": config["task_data"]} + else: + dataset = load_dataset(config["dataset"], logger=logger) + problems = sorted(dataset) if args.list: diff --git a/tests/test_experiment.py b/tests/test_experiment.py index b1c12387..a725af6f 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -7,7 +7,7 @@ import pytest from debug_gym.experiment import add_tools, create_env, dump_experiment_info -from debug_gym.gym.envs.local import LocalEnv +from debug_gym.gym.envs.free_env import FreeEnv from debug_gym.gym.tools.bash import BashTool from debug_gym.gym.tools.view import ViewTool from debug_gym.logger import DebugGymLogger @@ -29,7 +29,7 @@ class TestCreateEnv: """Test cases for create_env function""" def test_create_env_basic(self, tmp_path): - """Test basic environment creation with local environment""" + """Test basic environment creation with FreeEnv""" # Setup logger logger = DebugGymLogger("test") @@ -38,23 +38,26 @@ def test_create_env_basic(self, tmp_path): repo_path.mkdir() (repo_path / "test.py").write_text("# test file") - # Setup config for LocalEnv + # Setup config and task_data for FreeEnv config = { - "terminal": {"type": "local"}, - "benchmark": "local", - "env_kwargs": {"path": str(repo_path)}, - "problems": ["custom"], + "terminal": {"type": "docker"}, + "uuid": "test-uuid-123", + } + task_data = { + "env_type": 
"FreeEnv", + "image": "python:3.11", + "local_path": str(repo_path), } # Call function - result = create_env(config, logger) + result = create_env(config, task_data, logger) - # Assertions - verify we got a real LocalEnv instance - assert isinstance(result, LocalEnv) + # Assertions - verify we got a real FreeEnv instance + assert isinstance(result, FreeEnv) assert result.logger == logger - def test_create_env_default_problems(self, tmp_path): - """Test environment creation uses default problems when not specified""" + def test_create_env_with_env_config(self, tmp_path): + """Test environment creation with env config""" # Setup logger logger = DebugGymLogger("test") @@ -62,18 +65,23 @@ def test_create_env_default_problems(self, tmp_path): repo_path = tmp_path / "test_repo" repo_path.mkdir() - # Setup config without problems + # Setup config with env config config = { - "terminal": {"type": "local"}, - "benchmark": "local", - "env_kwargs": {"path": str(repo_path)}, + "terminal": {"type": "docker"}, + "uuid": "test-uuid-456", + "env": {"some_option": "value"}, + } + task_data = { + "env_type": "FreeEnv", + "image": "python:3.11", + "local_path": str(repo_path), } # Call function - result = create_env(config, logger) + result = create_env(config, task_data, logger) - # Assertions - should use default ["custom"] - assert isinstance(result, LocalEnv) + # Assertions - FreeEnv should be created + assert isinstance(result, FreeEnv) def test_create_env_with_terminal_none(self, tmp_path): """Test environment creation with no terminal (None)""" @@ -87,16 +95,19 @@ def test_create_env_with_terminal_none(self, tmp_path): # Setup config with terminal=None config = { "terminal": None, - "benchmark": "local", - "env_kwargs": {"path": str(repo_path)}, - "problems": [], + "uuid": "test-uuid-789", + } + task_data = { + "env_type": "FreeEnv", + "image": "python:3.11", + "local_path": str(repo_path), } # Call function - result = create_env(config, logger) + result = create_env(config, 
task_data, logger) - # Assertions - LocalEnv should be created even with terminal=None - assert isinstance(result, LocalEnv) + # Assertions - FreeEnv should be created even with terminal=None + assert isinstance(result, FreeEnv) class TestAddTools: @@ -104,10 +115,16 @@ class TestAddTools: def test_add_tools_single_tool(self, tmp_path): """Test adding a single tool to environment""" - # Create a real environment + # Create a real environment - use FreeEnv with task_data repo_path = tmp_path / "test_repo" repo_path.mkdir() - env = LocalEnv(path=str(repo_path)) + + task_data = { + "env_type": "FreeEnv", + "image": "python:3.11", + "local_path": str(repo_path), + } + env = FreeEnv(task_data=task_data) logger = DebugGymLogger("test") # Setup config @@ -125,7 +142,13 @@ def test_add_tools_multiple_tools(self, tmp_path): # Create a real environment repo_path = tmp_path / "test_repo" repo_path.mkdir() - env = LocalEnv(path=str(repo_path)) + + task_data = { + "env_type": "FreeEnv", + "image": "python:3.11", + "local_path": str(repo_path), + } + env = FreeEnv(task_data=task_data) logger = DebugGymLogger("test") # Setup config with multiple tools @@ -145,7 +168,13 @@ def test_add_tools_empty_list(self, tmp_path): # Create a real environment repo_path = tmp_path / "test_repo" repo_path.mkdir() - env = LocalEnv(path=str(repo_path)) + + task_data = { + "env_type": "FreeEnv", + "image": "python:3.11", + "local_path": str(repo_path), + } + env = FreeEnv(task_data=task_data) logger = DebugGymLogger("test") # Setup config with no tools From 2bc4856da75aa4c3d2f81388031b8023ce22be85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Tue, 9 Dec 2025 13:21:23 -0800 Subject: [PATCH 03/12] Fixing config --- debug_gym/agents/utils.py | 5 ++++- debug_gym/gym/envs/free_env.py | 4 ++-- scripts/run.py | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/debug_gym/agents/utils.py b/debug_gym/agents/utils.py index 80df0e8f..754eb176 100644 --- 
a/debug_gym/agents/utils.py +++ b/debug_gym/agents/utils.py @@ -117,7 +117,10 @@ def load_config(): # Parse overriden params. for param in args.params: - fqn_key, value = param.split("=") + fqn_key, value = param, "" + if "=" in param: + fqn_key, value = param.split("=") + entry_to_change = config keys = fqn_key.split(".") for k in keys[:-1]: diff --git a/debug_gym/gym/envs/free_env.py b/debug_gym/gym/envs/free_env.py index 0c1ffade..c4ffd331 100644 --- a/debug_gym/gym/envs/free_env.py +++ b/debug_gym/gym/envs/free_env.py @@ -50,10 +50,10 @@ def setup_task(self) -> None: self.terminal.base_image = self.task_data["image"] def setup_workspace(self): - self.workspace.working_dir = self.task_data["workspace_dir"] + self.workspace.working_dir = self.task_data.get("workspace_dir", "/testbed") self.workspace.reset() - if self.task_data["local_path"]: + if self.task_data.get("local_path"): self.logger.info( f"Copying content from {self.task_data['local_path']} to {self.workspace.working_dir}." ) diff --git a/scripts/run.py b/scripts/run.py index 2dfe66c6..d00cc1f8 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -173,6 +173,7 @@ def run_agent(args, task_name: str, task_data: dict, config: dict): def main(): config, args = load_config() config["uuid"] = config.get("uuid", str(uuid.uuid4())) + config["output_path"] = config.get("output_path", "exps") exp_output_path = Path(config["output_path"]) / config["uuid"] exp_output_path.mkdir(parents=True, exist_ok=True) logger = DebugGymLogger("debug-gym", level=args.logging_level) From 7d70af521a46c425f1236a5afceaf79ac0212f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Wed, 10 Dec 2025 08:18:49 -0800 Subject: [PATCH 04/12] Cleanup of freeenv + small refactoring --- .../config_aider.yaml => configs/aider.yaml | 0 {scripts => configs}/config.yaml | 0 .../free_env.yaml | 6 + .../mini_nightmare.yaml | 0 .../config_r2egym.yaml => configs/r2egym.yaml | 0 .../swebench.yaml | 0 
.../swebench_debug.yaml | 0 .../swesmith.yaml | 0 .../human_friendly_system_prompt.jinja | 0 debug_gym/agents/base_agent.py | 41 ++-- debug_gym/agents/free_agent.py | 44 ---- debug_gym/gym/envs/free_env.py | 11 +- scripts/free_env_human.py | 200 ------------------ scripts/replay.py | 3 +- scripts/run.py | 7 +- scripts/run_free_env.py | 163 -------------- 16 files changed, 34 insertions(+), 441 deletions(-) rename scripts/config_aider.yaml => configs/aider.yaml (100%) rename {scripts => configs}/config.yaml (100%) rename scripts/config_free_env.yaml => configs/free_env.yaml (52%) rename scripts/config_mini_nightmare.yaml => configs/mini_nightmare.yaml (100%) rename scripts/config_r2egym.yaml => configs/r2egym.yaml (100%) rename scripts/config_swebench.yaml => configs/swebench.yaml (100%) rename scripts/config_swebench_debug.yaml => configs/swebench_debug.yaml (100%) rename scripts/config_swesmith.yaml => configs/swesmith.yaml (100%) rename {scripts => configs}/templates/human_friendly_system_prompt.jinja (100%) delete mode 100644 debug_gym/agents/free_agent.py delete mode 100644 scripts/free_env_human.py delete mode 100644 scripts/run_free_env.py diff --git a/scripts/config_aider.yaml b/configs/aider.yaml similarity index 100% rename from scripts/config_aider.yaml rename to configs/aider.yaml diff --git a/scripts/config.yaml b/configs/config.yaml similarity index 100% rename from scripts/config.yaml rename to configs/config.yaml diff --git a/scripts/config_free_env.yaml b/configs/free_env.yaml similarity index 52% rename from scripts/config_free_env.yaml rename to configs/free_env.yaml index 888627a8..2ce3e55a 100644 --- a/scripts/config_free_env.yaml +++ b/configs/free_env.yaml @@ -23,3 +23,9 @@ terminal: agent: type: froggy max_steps: 20 + system_prompt: | + You are assisting in an exploratory codebase understanding session inside an open-ended container. + You have access to a set of tools to inspect and modify the codebase. 
+ Your goal is to use the tools to gather as much information about the codebase as possible. + Output both your thinking process (if any) and the tool call (must) in the response. + When you are done exploring, use the submit tool as the final action to end the session. diff --git a/scripts/config_mini_nightmare.yaml b/configs/mini_nightmare.yaml similarity index 100% rename from scripts/config_mini_nightmare.yaml rename to configs/mini_nightmare.yaml diff --git a/scripts/config_r2egym.yaml b/configs/r2egym.yaml similarity index 100% rename from scripts/config_r2egym.yaml rename to configs/r2egym.yaml diff --git a/scripts/config_swebench.yaml b/configs/swebench.yaml similarity index 100% rename from scripts/config_swebench.yaml rename to configs/swebench.yaml diff --git a/scripts/config_swebench_debug.yaml b/configs/swebench_debug.yaml similarity index 100% rename from scripts/config_swebench_debug.yaml rename to configs/swebench_debug.yaml diff --git a/scripts/config_swesmith.yaml b/configs/swesmith.yaml similarity index 100% rename from scripts/config_swesmith.yaml rename to configs/swesmith.yaml diff --git a/scripts/templates/human_friendly_system_prompt.jinja b/configs/templates/human_friendly_system_prompt.jinja similarity index 100% rename from scripts/templates/human_friendly_system_prompt.jinja rename to configs/templates/human_friendly_system_prompt.jinja diff --git a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py index 586d7eea..db0e751b 100644 --- a/debug_gym/agents/base_agent.py +++ b/debug_gym/agents/base_agent.py @@ -84,10 +84,7 @@ class BaseAgent: name: str = None args_class = AgentArgs system_prompt: str = "" - instance_prompt: str = """\ -{ - "Instructions": {{ info.instructions }} -}""" + instance_prompt: str = "Instructions: {{ info.instructions }}" def __init__( self, @@ -96,9 +93,9 @@ def __init__( logger: DebugGymLogger | None = None, ): self.args = self.args_class.make(agent_args or {}) - self.llm = llm self.history = 
HistoryTracker() self.logger = logger or DebugGymLogger("debug-gym") + self.llm = None self.env = None # Override prompts if provided in args @@ -163,7 +160,9 @@ def _load_prompt_template(self, template: str) -> Template: def build_system_prompt(self, info: EnvInfo | None = None) -> dict: """Build system prompt using the default template or one provided in args.""" system_prompt_template = self._load_prompt_template(self.system_prompt) + self.logger.debug(f"Loaded system prompt template:\n{self.system_prompt}") system_prompt = system_prompt_template.render(agent=self, info=info) + self.logger.debug(f"Rendered system prompt:\n{system_prompt}") # TODO: should we call self.llm.convert_observation_to_message(system_prompt) ? return {"role": "system", "content": filter_non_utf8(system_prompt)} @@ -171,7 +170,9 @@ def build_system_prompt(self, info: EnvInfo | None = None) -> dict: def build_instance_prompt(self, info: EnvInfo | None = None) -> dict: """Build instance prompt using the default template or one provided in args.""" instance_prompt_template = self._load_prompt_template(self.instance_prompt) + self.logger.debug(f"Loaded instance prompt template:\n{self.instance_prompt}") instance_prompt = instance_prompt_template.render(agent=self, info=info) + self.logger.debug(f"Rendered instance prompt:\n{instance_prompt}") return self.llm.convert_observation_to_message(instance_prompt) def build_history_prompt(self) -> list[dict]: @@ -237,9 +238,10 @@ def should_stop(self, step: int, info: EnvInfo): reason = "max_steps reached" return should_stop, reason - def run(self, env: RepoEnv, debug=False): - info = None + def run(self, env: RepoEnv, llm: LLM, debug=False): self.env = env + self.llm = llm + info = None step = 0 try: @@ -251,8 +253,8 @@ def run(self, env: RepoEnv, debug=False): if info.resolved: self.logger.report_progress( problem_id=env.task_name, - step=1, - total_steps=1, + step=0, + total_steps=self.args.max_steps, score=info.score, max_score=info.max_score, 
status="resolved", @@ -269,7 +271,7 @@ def run(self, env: RepoEnv, debug=False): step = 1 while not should_stop: - self.logger.info(f"\n{'='*20} STEP {step+1} {'='*20}\n") + self.logger.info(f"\n{'='*20} STEP {step} {'='*20}\n") messages = self.build_prompt(info) llm_response = self.llm(messages, info.tools) @@ -283,7 +285,7 @@ def run(self, env: RepoEnv, debug=False): llm_response.reasoning_response, ) self.history.step(info, llm_response) - should_stop, reason = self.should_stop(step, info) + should_stop, reason = self.should_stop(step + 1, info) status = ( "resolved" if info.resolved @@ -338,13 +340,10 @@ def _build_trajectory(self) -> Dict[str, Any]: return json_output -def create_agent( - agent_type: str, - *, - agent_args: AgentArgs | Dict[str, Any] | None = None, - config: Dict[str, Any] | None = None, - **agent_kwargs, -): +def create_agent(config: Dict[str, Any], **kwargs) -> BaseAgent: + """Create an agent from the config dictionary.""" + + agent_type = config.pop("agent_type", "froggy") if agent_type in AGENT_REGISTRY: agent_class = AGENT_REGISTRY[agent_type] elif "." 
in agent_type: @@ -360,9 +359,5 @@ def create_agent( else: raise ValueError(f"Unknown agent type: {agent_type}") - agent_args = agent_args or config - if agent_args is None: - raise ValueError("Either agent_args or config must be provided.") - - agent = agent_class(agent_args=agent_args, **agent_kwargs) + agent = agent_class(agent_args=config, **kwargs) return agent diff --git a/debug_gym/agents/free_agent.py b/debug_gym/agents/free_agent.py deleted file mode 100644 index 951bde3b..00000000 --- a/debug_gym/agents/free_agent.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Simple agent example for interacting with FreeEnv.""" - -from debug_gym.agents.base_agent import BaseAgent, register_agent - - -@register_agent -class FreeAgent(BaseAgent): - """Minimal reasoning agent tailored for FreeEnv sessions.""" - - name = "free_agent" - # Customized system instructions keep FreeEnv light-weight while still - # providing the model with a structured exploration checklist. - system_prompt = ( - "You are assisting in an exploratory codebase understanding session inside an open-ended container.\n" - "You have access to a set of tools to inspect and modify the codebase.\n" - "Your goal is to use the tools to gather as much information about the codebase as possible.\n" - "Output both your thinking process (if any) and the tool call (must) in the response.\n" - "When you are done exploring, use the submit tool as the final action to end the session." 
- ) - - def __init__(self, config, env, llm=None, logger=None): - super().__init__(config=config, env=env, llm=llm, logger=logger) - - override_prompt = config.get("system_prompt") - if override_prompt is not None: - self.system_prompt = str(override_prompt) - - def run(self, task_name=None, debug=False): - """Wrap BaseAgent.run to surface clearer errors when startup fails.""" - try: - return super().run(task_name=task_name, debug=debug) - except AttributeError as exc: - error_msg = str(exc) - sentinel = "'NoneType' object has no attribute 'max_score'" - if sentinel not in error_msg: - raise - - root_cause = exc.__context__ or exc.__cause__ or exc - self.logger.error( - "FreeAgent failed to reset the environment before receiving initial observations. " - "Check that the configured container image exists and is accessible." - ) - - raise root_cause diff --git a/debug_gym/gym/envs/free_env.py b/debug_gym/gym/envs/free_env.py index c4ffd331..250a9b4e 100644 --- a/debug_gym/gym/envs/free_env.py +++ b/debug_gym/gym/envs/free_env.py @@ -33,7 +33,8 @@ def __init__( "image": image, "local_path": local_path, "workspace_dir": workspace_dir, - "setup_commands": setup_commands or [], # TODO: use setup_commands + "setup_commands": setup_commands + or ["apt-get update -y && apt-get install -y git tree"], } super().__init__( task_data=task_data, @@ -64,14 +65,14 @@ def setup_workspace(self): self.workspace.setup_file_filters() # Use codebase's .debugignore and .debugreadonly. def setup_terminal(self) -> None: - """Apply FreeEnv tweaks and reuse RepoEnv git bootstrapping when enabled.""" + """Apply FreeEnv tweaks and reuse RepoEnv git boo? but the agent cantstrapping when enabled.""" self.logger.info(f"Configuring {self.terminal}...") # Ensure core utilities exist before RepoEnv renders directory listings. 
- self.terminal.run( - "apt-get update -y && apt-get install -y git tree", raises=True - ) + for cmd in self.task_data.get("setup_commands", []): + self.terminal.run(cmd, raises=True) + # self.terminal.run( # f"mkdir -p {shlex.quote(self._workspace_dir)}", # raises=True, diff --git a/scripts/free_env_human.py b/scripts/free_env_human.py deleted file mode 100644 index a82a9e6a..00000000 --- a/scripts/free_env_human.py +++ /dev/null @@ -1,200 +0,0 @@ -"""Interactive FreeEnv demo that runs a container image with a human operator.""" - -from __future__ import annotations - -import argparse -from pathlib import Path -from typing import Any, Iterable - -from debug_gym.gym.envs.free_env import FreeEnv -from debug_gym.gym.terminals import select_terminal -from debug_gym.gym.terminals.terminal import Terminal -from debug_gym.gym.tools.toolbox import Toolbox -from debug_gym.llms.human import Human -from debug_gym.logger import DebugGymLogger - -DEFAULT_IMAGE = "swesmith.x86_64.amueller__word_cloud.ec24191c" -DEFAULT_TOOLS = [ - "listdir", - "view", - "grep", - "rewrite", - "bash", - {"submit": {"eval_on_submit": False}}, -] - - -def format_observations(env_info) -> list[dict]: - messages = [ - { - "role": "system", - "content": env_info.instructions or "Interact with the repository.", - } - ] - - instructions_text = (env_info.instructions or "").strip() - for index, observation in enumerate(env_info.all_observations): - text = observation.observation.strip() - if index == 0 and text == instructions_text: - continue - prefix = f"[{observation.source}] " if observation.source else "" - messages.append({"role": "user", "content": f"{prefix}{text}"}) - return messages - - -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - description="Launch a FreeEnv session with human-in-the-loop control.", - ) - parser.add_argument( - "--image", - default=DEFAULT_IMAGE, - help="Docker image name to load inside the environment.", - ) - parser.add_argument( - 
"--terminal", - default="docker", - choices=["docker", "kubernetes"], - help="Terminal backend to use.", - ) - parser.add_argument( - "--registry", - default=None, - help="Optional registry prefix (e.g. ghcr.io/swe-bench).", - ) - parser.add_argument( - "--workspace-dir", - default="/testbed", - help="Working directory inside the container or pod.", - ) - parser.add_argument( - "--mount-path", - type=Path, - default=None, - help="Optional host path whose contents should be copied into the environment.", - ) - parser.add_argument( - "--setup-command", - action="append", - default=[], - help="Additional setup commands to run when the terminal starts (repeatable).", - ) - parser.add_argument( - "--tool", - dest="tools", - action="append", - default=None, - help="Tool name to add to the toolbox (can be specified multiple times).", - ) - parser.add_argument( - "--init-git", - action="store_true", - help="Initialize a git repository inside the environment (disabled by default).", - ) - parser.add_argument( - "--instructions", - default=None, - help="Custom instruction text displayed at reset.", - ) - parser.add_argument( - "--max-retries", - type=int, - default=10, - help="Maximum number of retries for invalid human tool calls.", - ) - parser.add_argument( - "--dir-tree-depth", - type=int, - default=2, - help="Depth of the directory tree shown in observations.", - ) - return parser - - -def _add_tools(env: FreeEnv, tool_specs: Iterable[Any], logger: DebugGymLogger) -> None: - """Attach toolbox entries, defaulting submit to eval_on_submit=False for humans.""" - - for spec in tool_specs: - tool_kwargs: dict[str, Any] = {} - if isinstance(spec, dict): - if len(spec) != 1: - raise ValueError("Tool dictionary must contain exactly one entry") - spec = dict(spec) - tool_name, tool_kwargs = next(iter(spec.items())) - else: - tool_name = str(spec) - - if tool_name == "submit" and "eval_on_submit" not in tool_kwargs: - tool_kwargs = {**tool_kwargs, "eval_on_submit": False} - - 
env.add_tool(Toolbox.get_tool(tool_name, **tool_kwargs)) - logger.debug("Loaded tool %s with options %s", tool_name, tool_kwargs) - - -def main(argv: list[str] | None = None) -> int: - parser = build_parser() - args = parser.parse_args(argv) - - logger = DebugGymLogger("free-env-demo") - - tool_specs: list[Any] - if args.tools: - # User-specified tools override defaults but still respect submit behaviour. - tool_specs = list(args.tools) - else: - tool_specs = list(DEFAULT_TOOLS) - - terminal_config: dict[str, Any] = { - "type": args.terminal, - "base_image": args.image, - "working_dir": args.workspace_dir, - } - if args.setup_command: - terminal_config["setup_commands"] = list(args.setup_command) - if args.registry: - terminal_config["registry"] = args.registry - - terminal: Terminal | None = select_terminal(terminal_config, logger=logger) - - env = FreeEnv( - image=args.image, - terminal=terminal, - mount_path=args.mount_path, - setup_commands=args.setup_command, - instructions=args.instructions, - init_git=args.init_git, - workspace_dir=args.workspace_dir, - logger=logger, - dir_tree_depth=args.dir_tree_depth, - ) - - _add_tools(env, tool_specs, logger) - logger.info("Loaded tools: %s", env.tool_names) - - info = env.reset() - human = Human(logger=logger, max_retries=args.max_retries) - - try: - while True: - messages = format_observations(info) - response = human(messages, env.tools) - logger.info( - "Running %s with arguments %s", - response.tool.name, - response.tool.arguments, - ) - info = env.step( - response.tool, - action_content=response.response, - ) - except KeyboardInterrupt: - logger.info("Session interrupted by user.") - except ValueError as exc: - logger.error("Session terminated: %s", exc) - finally: - env.close() - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/scripts/replay.py b/scripts/replay.py index 6ed2a364..b6be1ae1 100644 --- a/scripts/replay.py +++ b/scripts/replay.py @@ -186,8 +186,7 @@ def 
run_task(args, problem, config): agent = create_agent( config["agent_type"], - config=config, - env=env, + agent_args=config, llm=llm, logger=task_logger, ) diff --git a/scripts/run.py b/scripts/run.py index d00cc1f8..2b04c264 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -95,14 +95,13 @@ def run_agent(args, task_name: str, task_data: dict, config: dict): ) agent = create_agent( - config["agent_type"], - agent_args=config, - llm=llm, + config["agent"], + # llm=llm, logger=task_logger, ) try: - success = agent.run(env, debug=args.debug) + success = agent.run(env, llm, debug=args.debug) except KeyboardInterrupt: task_logger.error("Agent run was interrupted by user.") task_logger.report_progress( diff --git a/scripts/run_free_env.py b/scripts/run_free_env.py deleted file mode 100644 index 13d7b367..00000000 --- a/scripts/run_free_env.py +++ /dev/null @@ -1,163 +0,0 @@ -"""Standalone runner for FreeEnv + FreeAgent with human-visible logging.""" - -from __future__ import annotations - -import argparse -from pathlib import Path -from typing import Any, Mapping - -from debug_gym.agents.free_agent import FreeAgent -from debug_gym.gym.envs.free_env import FreeEnv -from debug_gym.gym.terminals import select_terminal -from debug_gym.gym.terminals.terminal import Terminal -from debug_gym.gym.tools.toolbox import Toolbox -from debug_gym.llms.base import LLM -from debug_gym.llms.human import Human -from debug_gym.logger import DebugGymLogger - - -def build_parser() -> argparse.ArgumentParser: - """Create the CLI parser that exposes the runner configuration flag.""" - parser = argparse.ArgumentParser(description="Run FreeAgent against FreeEnv.") - parser.add_argument( - "--config", - type=Path, - default=Path("scripts/config_free_env.yaml"), - help="Path to the YAML configuration file.", - ) - return parser - - -def load_app_config(path: Path) -> dict: - """Load the YAML configuration used to seed the environment and agent.""" - import yaml - - with open(path, "r", 
encoding="utf-8") as handle: - return yaml.safe_load(handle) - - -def build_llm(config: dict, logger: DebugGymLogger): - """Instantiate the LLM (or human driver) based on configuration defaults.""" - llm_cfg = config.get("llm") or {} - llm_name = llm_cfg.get("name") or config.get("llm_name") or "human" - - if llm_name.lower() == "human": - return Human(model_name="human", logger=logger) - - return LLM.instantiate( - llm_name=llm_name, - llm_config_file_path=llm_cfg.get("config_file") - or config.get("llm_config_file_path"), - logger=logger, - ) - - -def resolve_terminal( - env_config: Mapping[str, Any], - logger: DebugGymLogger, -) -> Terminal | None: - """Resolve the requested terminal backend, normalizing legacy config shapes.""" - terminal_setting = env_config.get("terminal") - - if isinstance(terminal_setting, Terminal): - return terminal_setting - - if terminal_setting is None: - terminal_config: dict[str, Any] = {"type": "docker"} - elif isinstance(terminal_setting, str): - terminal_config = {"type": terminal_setting} - elif isinstance(terminal_setting, Mapping): - terminal_config = dict(terminal_setting) - else: - raise TypeError( - "terminal configuration must be a mapping, string, Terminal, or None", - ) - - terminal_config.setdefault("type", "docker") - terminal_config["type"] = str(terminal_config["type"]).lower() - terminal_config.setdefault("base_image", env_config["image"]) - terminal_config.setdefault( - "working_dir", env_config.get("workspace_dir", "/testbed") - ) - - setup_commands = env_config.get("setup_commands") - if setup_commands: - terminal_config.setdefault("setup_commands", list(setup_commands)) - - overrides = dict(env_config.get("terminal_kwargs") or {}) - terminal_config.update(overrides) - - return select_terminal(terminal_config, logger=logger) - - -def add_tools(env: FreeEnv, tools_config: list[Any], logger: DebugGymLogger) -> None: - """Instantiate tools defined in config, honoring optional per-tool kwargs.""" - - for tool_entry in 
tools_config: - tool_kwargs: dict[str, Any] = {} - if isinstance(tool_entry, Mapping): - if len(tool_entry) != 1: - raise ValueError("Tool mapping entries must contain a single tool name") - tool_entry = dict(tool_entry) - tool_name, tool_kwargs = next(iter(tool_entry.items())) - else: - tool_name = str(tool_entry) - - if tool_name == "submit" and "eval_on_submit" not in tool_kwargs: - tool_kwargs = {**tool_kwargs, "eval_on_submit": False} - - env.add_tool(Toolbox.get_tool(tool_name, **tool_kwargs)) - logger.debug("Added tool %s with options %s", tool_name, tool_kwargs) - - -def main() -> int: - """Entrypoint for running FreeAgent against FreeEnv from the command line.""" - args = build_parser().parse_args() - config = load_app_config(args.config) - - logger = DebugGymLogger("free-agent-run") - - env_cfg = config["environment"] - terminal = resolve_terminal(env_cfg, logger) - # Copy only the knobs understood by FreeEnv, leaving unrelated config behind. - env_kwargs = dict( - image=env_cfg["image"], - terminal=terminal, - mount_path=env_cfg.get("mount_path"), - setup_commands=env_cfg.get("setup_commands"), - instructions=env_cfg.get("instructions"), - init_git=env_cfg.get("init_git", True), - workspace_dir=env_cfg.get("workspace_dir", "/testbed"), - logger=logger, - dir_tree_depth=env_cfg.get("dir_tree_depth", 2), - ) - - # Instantiate the environment once the terminal and core parameters are ready. - env = FreeEnv(**env_kwargs) - - tools_config = config.get("tools") - if not tools_config: - raise ValueError( - "Configuration must specify a non-empty 'tools' list for FreeEnv sessions." 
- ) - - add_tools(env, tools_config, logger) - - llm = build_llm(config, logger) - agent_config = config.get("agent", {}) - agent = FreeAgent(config=agent_config, env=env, llm=llm, logger=logger) - - task_name = config.get("task_name", "free-session") - - try: - resolved = agent.run(task_name=task_name) - agent.save_trajectory(task_name=task_name) - agent.save_patch(task_name=task_name) - logger.info(f"Run complete. Resolved={resolved}") - return 0 - finally: - env.close() - - -if __name__ == "__main__": - raise SystemExit(main()) From 67bf7b7da28d38e46da768cd5c420e3dc45d9ee0 Mon Sep 17 00:00:00 2001 From: "Xingdi (Eric) Yuan" Date: Wed, 10 Dec 2025 11:28:16 -0500 Subject: [PATCH 05/12] Update free_env.yaml --- configs/free_env.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/free_env.yaml b/configs/free_env.yaml index 2ce3e55a..1feb0bbb 100644 --- a/configs/free_env.yaml +++ b/configs/free_env.yaml @@ -6,7 +6,7 @@ llm_name: "human" # Tools to load into the environment toolbox. tools: - - rewrite + - edit - bash - submit: eval_on_submit: False # Here we only terminate after submission, no auto-eval. 
From 543dec4de42e4da3249b82a8855082beed9e9eba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Wed, 10 Dec 2025 09:22:06 -0800 Subject: [PATCH 06/12] More cleanup --- debug_gym/agents/utils.py | 6 ++-- debug_gym/experiment.py | 6 ++++ debug_gym/llms/base.py | 7 ++-- scripts/replay.py | 34 +++++------------- scripts/run.py | 23 +++--------- tests/agents/test_free_agent.py | 62 --------------------------------- tests/llms/test_base.py | 12 +++---- 7 files changed, 35 insertions(+), 115 deletions(-) delete mode 100644 tests/agents/test_free_agent.py diff --git a/debug_gym/agents/utils.py b/debug_gym/agents/utils.py index 60aa01b5..0dc1790d 100644 --- a/debug_gym/agents/utils.py +++ b/debug_gym/agents/utils.py @@ -131,8 +131,10 @@ def load_config(): entry_to_change[keys[-1]] = yaml.safe_load(value) # assume agent type is the key if not specified by the user - if not config.get("agent_type"): - config["agent_type"] = args.agent + if "agent" not in config: + config["agent"] = {} + if "type" not in config["agent"]: + config["agent"]["type"] = args.agent return config, args diff --git a/debug_gym/experiment.py b/debug_gym/experiment.py index 35fce00e..5d438f45 100644 --- a/debug_gym/experiment.py +++ b/debug_gym/experiment.py @@ -20,6 +20,8 @@ def create_env(config: dict, task_data: dict, logger: DebugGymLogger): logger=logger, **config.get("env", {}), ) + + add_tools(env, config, logger) return env @@ -30,6 +32,10 @@ def add_tools(env, config: dict, logger: DebugGymLogger): if isinstance(tool, dict): assert len(tool) == 1, "Tool dict must have exactly one key" tool, tool_config = list(tool.items())[0] + if isinstance(config["tools"], dict) and isinstance( + config["tools"][tool], dict + ): + tool_config.update(config["tools"][tool]) tool_instantiated = Toolbox.get_tool(tool, **tool_config) env.add_tool(tool_instantiated) diff --git a/debug_gym/llms/base.py b/debug_gym/llms/base.py index 61f96051..2df00498 100644 --- a/debug_gym/llms/base.py 
+++ b/debug_gym/llms/base.py @@ -2,7 +2,7 @@ import os from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import Any, Dict, List, Optional import yaml from tenacity import ( @@ -218,7 +218,7 @@ def __init__( @classmethod def instantiate( cls, - llm_name: str, + config: Dict[str, Any], llm_config_file_path: str | None = None, logger: DebugGymLogger | None = None, ) -> "LLM": @@ -232,7 +232,9 @@ def instantiate( Returns: An instance of the appropriate LLM class. """ + logger = logger or DebugGymLogger("debug-gym") + llm_name = config.get("name") if not llm_name: return None @@ -242,6 +244,7 @@ def instantiate( return Human(llm_name, logger=logger) llm_config = LLMConfigRegistry.from_file(llm_config_file_path)[llm_name] + llm_config.update(config) tags = llm_config.tags diff --git a/scripts/replay.py b/scripts/replay.py index b6be1ae1..3dafd211 100644 --- a/scripts/replay.py +++ b/scripts/replay.py @@ -34,13 +34,13 @@ def timeout_handler(signum, frame): signal.alarm(timeout_seconds) -def run_replay_agent(agent, task_name=None, args=None): +def run_replay_agent(agent, env, llm, task_name=None, args=None): step = 0 info = None max_steps = agent.config["max_steps"] try: agent.history.reset() - info = agent.env.reset(options={"task_name": task_name}) + info = env.reset(options={"task_name": task_name}) # initial state does not have prompt and response agent.history.step(info, None) @@ -57,7 +57,7 @@ def run_replay_agent(agent, task_name=None, args=None): agent.logger.info( "Available tools (in LLM's tool calling format):\n" - f"{json.dumps(agent.llm.define_tools(info.tools), indent=4)}\n" + f"{json.dumps(llm.define_tools(info.tools), indent=4)}\n" ) highscore = info.score @@ -89,12 +89,12 @@ def run_replay_agent(agent, task_name=None, args=None): f"LLM response - tool call: {llm_response.tool}" ) else: - llm_response = agent.llm(messages, info.tools) + llm_response = llm(messages, info.tools) if args.debug 
and (args.debug_at is None or step >= args.debug_at): breakpoint() - info = agent.env.step( + info = env.step( llm_response.tool, llm_response.response, llm_response.reasoning_response, @@ -176,23 +176,11 @@ def run_task(args, problem, config): ) env = create_env(config, task_logger) - add_tools(env, config, task_logger) - - llm = LLM.instantiate( - llm_name=config["llm_name"], - llm_config_file_path=config.get("llm_config_file_path"), - logger=task_logger, - ) - - agent = create_agent( - config["agent_type"], - agent_args=config, - llm=llm, - logger=task_logger, - ) + llm = LLM.instantiate(config["llm"], logger=task_logger) + agent = create_agent(config["agent"], logger=task_logger) try: - success = run_replay_agent(agent, task_name=problem, args=args) + success = run_replay_agent(agent, env, llm, task_name=problem, args=args) except KeyboardInterrupt: task_logger.error("Agent run was interrupted by user.") task_logger.report_progress( @@ -366,11 +354,7 @@ def main(): len(problems) == 1 ), "Replay only supports a single problem in the trajectory file." 
- llm = LLM.instantiate( - llm_name=config["llm_name"], - llm_config_file_path=config.get("llm_config_file_path"), - logger=logger, - ) + llm = LLM.instantiate(config["llm"], logger=logger) # Stop live progress display if in Human mode (avoid conflicts with prompt_toolkit) if isinstance(llm, Human) or args.debug: diff --git a/scripts/run.py b/scripts/run.py index 2b04c264..f7b3c184 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -86,19 +86,8 @@ def run_agent(args, task_name: str, task_data: dict, config: dict): ) env = create_env(config, task_data, task_logger) - add_tools(env, config, task_logger) - - llm = LLM.instantiate( - llm_name=config["llm_name"], - llm_config_file_path=config.get("llm_config_file_path"), - logger=task_logger, - ) - - agent = create_agent( - config["agent"], - # llm=llm, - logger=task_logger, - ) + llm = LLM.instantiate(config.get("llm", {}), logger=task_logger) + agent = create_agent(config.get("agent", {}), logger=task_logger) try: success = agent.run(env, llm, debug=args.debug) @@ -176,6 +165,7 @@ def main(): exp_output_path = Path(config["output_path"]) / config["uuid"] exp_output_path.mkdir(parents=True, exist_ok=True) logger = DebugGymLogger("debug-gym", level=args.logging_level) + logger.debug(f"Experiment config: {config}") logger.info(f"Experiment log path: {exp_output_path}") dump_experiment_info(config, args) @@ -199,11 +189,8 @@ def main(): return - llm = LLM.instantiate( - llm_name=config["llm_name"], - llm_config_file_path=config.get("llm_config_file_path"), - logger=logger, - ) + # Try to instantiate the LLM once to catch configuration errors early. 
+ llm = LLM.instantiate(config=config["llm"], logger=logger) # Stop live progress display if --no-live-display is set # or in Human mode (avoid conflicts with prompt_toolkit) diff --git a/tests/agents/test_free_agent.py b/tests/agents/test_free_agent.py deleted file mode 100644 index 0bb9ffd2..00000000 --- a/tests/agents/test_free_agent.py +++ /dev/null @@ -1,62 +0,0 @@ -from unittest.mock import MagicMock, patch - -import pytest - -from debug_gym.agents.base_agent import BaseAgent -from debug_gym.agents.free_agent import FreeAgent - - -@pytest.fixture -def make_free_agent(agent_setup): - def _factory(*, config_override=None): - agent, env, llm = next(agent_setup(FreeAgent, config_override=config_override)) - agent.logger = MagicMock() - return agent, env, llm - - return _factory - - -def test_free_agent_run_delegates_to_base(make_free_agent): - agent, _, _ = make_free_agent() - - with patch.object(BaseAgent, "run", return_value=True) as mock_run: - result = agent.run(task_name="demo", debug=True) - - mock_run.assert_called_once_with(task_name="demo", debug=True) - assert result is True - - -def test_free_agent_reraises_root_cause_for_missing_reset(make_free_agent): - agent, _, _ = make_free_agent() - - def side_effect(*args, **kwargs): - try: - raise RuntimeError("reset failed") - except RuntimeError as exc: # pragma: no cover - exercised below - raise AttributeError( - "'NoneType' object has no attribute 'max_score'" - ) from exc - - with patch.object(BaseAgent, "run", side_effect=side_effect): - with pytest.raises(RuntimeError) as excinfo: - agent.run(task_name="demo") - - assert str(excinfo.value) == "reset failed" - agent.logger.error.assert_called_once() - - -def test_free_agent_bubbles_unrelated_attribute_error(make_free_agent): - agent, _, _ = make_free_agent() - - with patch.object(BaseAgent, "run", side_effect=AttributeError("other")): - with pytest.raises(AttributeError, match="other"): - agent.run(task_name="demo") - - 
agent.logger.error.assert_not_called() - - -def test_free_agent_system_prompt_override(make_free_agent): - custom_prompt = "Inspect quietly." - agent, _, _ = make_free_agent(config_override={"system_prompt": custom_prompt}) - - assert agent.system_prompt == custom_prompt diff --git a/tests/llms/test_base.py b/tests/llms/test_base.py index 53276193..7a882cc2 100644 --- a/tests/llms/test_base.py +++ b/tests/llms/test_base.py @@ -70,23 +70,23 @@ def test_instantiate_llm(mock_open, logger_mock): assert llm is None # tags are used to filter models - llm = LLM.instantiate("gpt-4o-mini", logger=logger_mock) + llm = LLM.instantiate({"name": "gpt-4o-mini"}, logger=logger_mock) assert isinstance(llm, OpenAILLM) - llm = LLM.instantiate("gpt-4o-mini-azure", logger=logger_mock) + llm = LLM.instantiate({"name": "gpt-4o-mini-azure"}, logger=logger_mock) assert isinstance(llm, AzureOpenAILLM) - llm = LLM.instantiate("claude-3.7", logger=logger_mock) + llm = LLM.instantiate({"name": "claude-3.7"}, logger=logger_mock) assert isinstance(llm, AnthropicLLM) - llm = LLM.instantiate("qwen-3", logger=logger_mock) + llm = LLM.instantiate({"name": "qwen-3"}, logger=logger_mock) assert isinstance(llm, HuggingFaceLLM) - llm = LLM.instantiate("human", logger=logger_mock) + llm = LLM.instantiate({"name": "human"}, logger=logger_mock) assert isinstance(llm, Human) with pytest.raises(ValueError, match="Model unknown not found in llm config .+"): - LLM.instantiate("unknown", logger=logger_mock) + LLM.instantiate({"name": "unknown"}, logger=logger_mock) class Tool1(EnvironmentTool): From 364bb78ac19eb6ead2beb385b51373082fd353fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Wed, 10 Dec 2025 10:54:38 -0800 Subject: [PATCH 07/12] Fixing tests --- debug_gym/agents/base_agent.py | 2 +- debug_gym/agents/utils.py | 17 +--- debug_gym/experiment.py | 2 +- tests/agents/test_base_agent.py | 38 ++++----- tests/agents/test_froggy_agents.py | 8 +- tests/agents/test_utils.py | 
85 ++++++++---------- tests/gym/envs/test_free_env.py | 133 ----------------------------- tests/gym/tools/test_pdb.py | 26 +----- 8 files changed, 62 insertions(+), 249 deletions(-) diff --git a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py index db0e751b..4af8b0b0 100644 --- a/debug_gym/agents/base_agent.py +++ b/debug_gym/agents/base_agent.py @@ -343,7 +343,7 @@ def _build_trajectory(self) -> Dict[str, Any]: def create_agent(config: Dict[str, Any], **kwargs) -> BaseAgent: """Create an agent from the config dictionary.""" - agent_type = config.pop("agent_type", "froggy") + agent_type = config.get("type", "froggy") if agent_type in AGENT_REGISTRY: agent_class = AGENT_REGISTRY[agent_type] elif "." in agent_type: diff --git a/debug_gym/agents/utils.py b/debug_gym/agents/utils.py index 0dc1790d..9dc54ba8 100644 --- a/debug_gym/agents/utils.py +++ b/debug_gym/agents/utils.py @@ -10,10 +10,9 @@ from debug_gym.logger import DebugGymLogger -def load_config(): +def load_config(args=None): parser = argparse.ArgumentParser() parser.add_argument("--config", help="path to config file") - parser.add_argument("--agent", help="agent to use") parser.add_argument( "--debug", action="store_true", @@ -102,19 +101,13 @@ def load_config(): help="override params of the config file," " e.g. -p 'edit_only.random_seed=123'", ) - args = parser.parse_args() + args = parser.parse_args(args) config = {} if args.config is not None: assert os.path.exists(args.config), "Invalid config file" with open(args.config) as reader: config = yaml.safe_load(reader) - args.agent = args.agent or "froggy" - if args.agent not in AGENT_REGISTRY: - raise ValueError( - f"Invalid agent: {args.agent}. Available agents: {sorted(AGENT_REGISTRY)}" - ) - # Parse overriden params. 
for param in args.params: fqn_key, value = param, "" @@ -130,12 +123,6 @@ def load_config(): entry_to_change = entry_to_change[k] entry_to_change[keys[-1]] = yaml.safe_load(value) - # assume agent type is the key if not specified by the user - if "agent" not in config: - config["agent"] = {} - if "type" not in config["agent"]: - config["agent"]["type"] = args.agent - return config, args diff --git a/debug_gym/experiment.py b/debug_gym/experiment.py index 5d438f45..e54bb73e 100644 --- a/debug_gym/experiment.py +++ b/debug_gym/experiment.py @@ -27,7 +27,7 @@ def create_env(config: dict, task_data: dict, logger: DebugGymLogger): def add_tools(env, config: dict, logger: DebugGymLogger): """Add tools to the environment""" - for tool in config["tools"]: + for tool in config.get("tools", []): tool_config = {} if isinstance(tool, dict): assert len(tool) == 1, "Tool dict must have exactly one key" diff --git a/tests/agents/test_base_agent.py b/tests/agents/test_base_agent.py index e5c559f1..176b90a4 100644 --- a/tests/agents/test_base_agent.py +++ b/tests/agents/test_base_agent.py @@ -57,8 +57,8 @@ def test_create_agent(): class TestRegisteredAgent(BaseAgent): name = "test_registered" - def __init__(self, agent_args, env, **kwargs): - super().__init__(agent_args, env, **kwargs) + def __init__(self, agent_args, **kwargs): + super().__init__(agent_args, **kwargs) # Clear and setup registry original_registry = AGENT_REGISTRY.copy() @@ -68,17 +68,15 @@ def __init__(self, agent_args, env, **kwargs): try: # Mock the required parameters mock_config = { + "type": "test_registered", "max_steps": 5, } - agent_args = AgentArgs.from_dict(mock_config) - mock_env = MagicMock() - - agent = create_agent("test_registered", agent_args=agent_args, env=mock_env) + agent = create_agent(mock_config) assert isinstance(agent, TestRegisteredAgent) # Test unknown agent type with pytest.raises(ValueError, match="Unknown agent type: unknown_agent"): - create_agent("unknown_agent", 
agent_args=agent_args, env=mock_env) + create_agent({"type": "unknown_agent"}) # Test module import (mock importlib) with patch("importlib.import_module") as mock_import: @@ -86,9 +84,7 @@ def __init__(self, agent_args, env, **kwargs): mock_module.TestClass = TestRegisteredAgent mock_import.return_value = mock_module - agent = create_agent( - "some.module.TestClass", agent_args=agent_args, env=mock_env - ) + agent = create_agent({"type": "some.module.TestClass"}) assert isinstance(agent, TestRegisteredAgent) mock_import.assert_called_once_with("some.module") finally: @@ -124,10 +120,8 @@ def test_to_pretty_json(): def test_load_prompt_template_with_filters(tmp_path): """Test system prompt template loading with custom filters""" - llm = MagicMock() - llm.context_length = 2000 - llm.count_tokens = Mock(return_value=500) - agent = BaseAgent(llm=llm) + agent = BaseAgent() + agent.llm = Human() agent.system_prompt = "Test task" # Create template that uses custom filters @@ -195,7 +189,8 @@ def test_build_system_prompt_with_template_file(tmp_path): def test_build_instance_prompt_with_no_template(): - agent = BaseAgent(llm=Human()) + agent = BaseAgent() + agent.llm = Human() mock_info = MagicMock() mock_info.instructions = "Test instructions." 
@@ -208,7 +203,8 @@ def test_build_instance_prompt_with_no_template(): def test_build_instance_prompt_provided_in_args(): instance_prompt = "Custom instance prompt" - agent = BaseAgent(agent_args={"instance_prompt": instance_prompt}, llm=Human()) + agent = BaseAgent(agent_args={"instance_prompt": instance_prompt}) + agent.llm = Human() assert agent.instance_prompt == instance_prompt instance_message = agent.build_instance_prompt() assert sorted(instance_message.keys()) == ["content", "role"] @@ -218,9 +214,8 @@ def test_build_instance_prompt_provided_in_args(): def test_build_instance_prompt_with_template(): instance_prompt_template = "Your Mission: {{ info.instructions }}" - agent = BaseAgent( - agent_args={"instance_prompt": instance_prompt_template}, llm=Human() - ) + agent = BaseAgent(agent_args={"instance_prompt": instance_prompt_template}) + agent.llm = Human() mock_info = MagicMock() mock_info.instructions = "If you choose to accept it." @@ -235,9 +230,8 @@ def test_build_instance_prompt_with_template_file(tmp_path): instance_prompt_template = "Your Mission: {{ info.instructions }}" instance_prompt_template_file = tmp_path / "instance_prompt.jinja" instance_prompt_template_file.write_text(instance_prompt_template) - agent = BaseAgent( - agent_args={"instance_prompt": instance_prompt_template_file}, llm=Human() - ) + agent = BaseAgent(agent_args={"instance_prompt": instance_prompt_template_file}) + agent.llm = Human() mock_info = MagicMock() mock_info.instructions = "If you choose to accept it." 
diff --git a/tests/agents/test_froggy_agents.py b/tests/agents/test_froggy_agents.py index 2eb1679b..a937053b 100644 --- a/tests/agents/test_froggy_agents.py +++ b/tests/agents/test_froggy_agents.py @@ -149,7 +149,7 @@ def test_run(agent_setup, build_env_info): tool=ToolCall(id="tool_id", name="tool_name", arguments={}), token_usage=TokenUsage(2, 4), ) - result = agent.run(env, debug=False) + result = agent.run(env, llm, debug=False) assert result @@ -243,7 +243,7 @@ def test_run_early_completion(agent_setup, build_env_info): step_observation="Test last run obs", ) - result = agent.run(env) + result = agent.run(env, llm) assert result["success"] is True env.step.assert_not_called() # Should not step if already done @@ -282,7 +282,7 @@ def test_run_stops_at_max_steps(agent_setup, build_env_info): response_token_count=4, ) - result = agent.run(env) + result = agent.run(env, llm) assert result["success"] is False assert env.step.call_count == 1 @@ -305,7 +305,7 @@ def test_run_exception_handling(agent_setup, build_env_info): llm.side_effect = RuntimeError("Test error") with pytest.raises(RuntimeError, match="Test error"): - agent.run(env) + agent.run(env, llm) def test_save_patch(agent_setup, tmp_path): diff --git a/tests/agents/test_utils.py b/tests/agents/test_utils.py index 58530f33..d44639b9 100644 --- a/tests/agents/test_utils.py +++ b/tests/agents/test_utils.py @@ -14,20 +14,15 @@ def test_load_config(): # do the test in a tmp folder tempdir = tempfile.TemporaryDirectory(prefix="TestLoadConfig-") working_dir = Path(tempdir.name) - config_file = working_dir / "config.yaml" + config_file = str(working_dir / "config.yaml") atexit.register(tempdir.cleanup) # Make sure to cleanup that folder once done. 
- config_contents = {} - config_contents["base"] = { - "random_seed": 42, - "max_steps": 100, - } - config_contents["pdb_agent"] = { - "llm_name": "gpt2", - } - config_contents["edit_only"] = { - "cot_style": "standard", - "llm_name": "gpt20", + config_contents = { + "agent": { + "max_steps": 100, + "type": "pdb_agent", + }, + "llm": {"name": "gpt2"}, } # write the config file into yaml @@ -35,53 +30,45 @@ def test_load_config(): yaml.dump(config_contents, f) # now test - with patch( - "sys.argv", - [ - "config_file", - str(config_file), - "--agent", - "pdb_agent", - "-p", - "random_seed=123", - "-v", - "--debug", - ], - ): - _config, _args = load_config() - assert _args.agent == "pdb_agent" + args = [ + "--config", + config_file, + "-v", + "--debug", + ] + _config, _args = load_config(args) expected_config = { - "agent_type": "pdb_agent", - "random_seed": 123, - "max_steps": 100, - "llm_name": "gpt2", + "agent": { + "type": "pdb_agent", + "max_steps": 100, + }, + "llm": {"name": "gpt2"}, } assert _config == expected_config assert _args.debug is True assert _args.logging_level == logging.INFO # another test - with patch( - "sys.argv", - [ - "config_file", - str(config_file), - "--agent", - "edit_only", - "-p", - "random_seed=456", - "-v", - "--debug", - ], - ): - _config, _args = load_config() - assert _args.agent == "edit_only" + args = [ + "--config", + config_file, + "-p", + "agent.type=edit_only", + "random_seed=456", + "cot_style=standard", + "llm.name=gpt20", + "-v", + "--debug", + ] + _config, _args = load_config(args) expected_config = { - "agent_type": "edit_only", + "agent": { + "type": "edit_only", + "max_steps": 100, + }, "random_seed": 456, - "max_steps": 100, "cot_style": "standard", - "llm_name": "gpt20", + "llm": {"name": "gpt20"}, } assert _config == expected_config assert _args.debug is True diff --git a/tests/gym/envs/test_free_env.py b/tests/gym/envs/test_free_env.py index 99fd3ecd..e69de29b 100644 --- a/tests/gym/envs/test_free_env.py +++ 
b/tests/gym/envs/test_free_env.py @@ -1,133 +0,0 @@ -from pathlib import Path -from typing import Any -from unittest.mock import MagicMock - -from debug_gym.gym.envs.free_env import FreeEnv -from debug_gym.gym.terminals.local import LocalTerminal -from debug_gym.gym.terminals.terminal import Terminal - - -class DummyTerminal(Terminal): - """Test helper terminal with minimal behavior for FreeEnv interactions.""" - - def __init__( - self, - *, - working_dir: str = "/tmp/test", - logger: Any | None = None, - base_image: str | None = None, - setup_commands: list[str] | None = None, - ): - super().__init__(working_dir=working_dir, logger=logger) - self.base_image = base_image - self.setup_commands = list(setup_commands or []) - self.closed = False - - def prepare_command(self, entrypoint): - return ["/bin/true"] - - def run(self, entrypoint, timeout=None, raises=False, strip_output=True): - if isinstance(entrypoint, str) and "tree" in entrypoint: - return True, "/workspace\n" - return True, "" - - @property - def default_shell_command(self): - return "/bin/true" - - def new_shell_session(self): - return None - - def copy_content(self, src, target=None): - return None - - def close(self): - self.closed = True - - -def test_free_env_defaults_to_local_terminal(): - logger = MagicMock() - - env = FreeEnv(image="ubuntu:22.04", logger=logger) - - assert isinstance(env.terminal, LocalTerminal) - assert env.container_image == "ubuntu:22.04" - - -def test_free_env_configures_existing_terminal(): - logger = MagicMock() - terminal_logger = MagicMock() - terminal = DummyTerminal( - working_dir="/initial", - logger=terminal_logger, - base_image="base", - setup_commands=["existing"], - ) - - env = FreeEnv( - image="ubuntu:22.04", - terminal=terminal, - setup_commands=["apt update"], - workspace_dir="/workspace", - logger=logger, - init_git=False, - ) - - env.reset() - - assert env.terminal is terminal - assert terminal.base_image == "ubuntu:22.04" - assert terminal.working_dir == 
"/workspace" - assert terminal.logger is logger - assert terminal.setup_commands == ["apt update"] - - -def test_free_env_respects_custom_workspace(tmp_path): - logger = MagicMock() - terminal = DummyTerminal(logger=logger) - - env = FreeEnv( - image="ubuntu:22.04", - terminal=terminal, - workspace_dir="/workspace", - logger=logger, - init_git=False, - ) - - env.reset() - - assert env.workspace.working_dir == Path("/workspace") - assert terminal.working_dir == "/workspace" - - -def test_free_env_reset_allows_dynamic_overrides(): - logger = MagicMock() - terminal = DummyTerminal(logger=logger, setup_commands=["initial"]) - - env = FreeEnv( - image="ubuntu:22.04", - terminal=terminal, - setup_commands=["initial"], - workspace_dir="/workspace", - logger=logger, - init_git=True, - ) - - env.reset( - options={ - "image": "ubuntu:24.04", - "workspace_dir": "/new", - "setup_commands": ["echo ready"], - "instructions": "Inspect carefully.", - "init_git": False, - } - ) - - assert env.container_image == "ubuntu:24.04" - assert env.instructions == "Inspect carefully." 
- assert env.init_git is False - assert env._workspace_dir == "/new" - assert terminal.working_dir == "/new" - assert terminal.setup_commands == ["echo ready"] - assert terminal.base_image == "ubuntu:24.04" - assert terminal.closed is True diff --git a/tests/gym/tools/test_pdb.py b/tests/gym/tools/test_pdb.py index 9c337b5e..50955af2 100644 --- a/tests/gym/tools/test_pdb.py +++ b/tests/gym/tools/test_pdb.py @@ -1,7 +1,5 @@ import copy -import platform import re -import subprocess from unittest.mock import MagicMock import pytest @@ -12,26 +10,6 @@ from debug_gym.gym.tools.pdb import PDBTool -def is_docker_running(): - try: - subprocess.check_output(["docker", "ps"]) - return True - except Exception: - return False - - -if_docker_running = pytest.mark.skipif( - not is_docker_running(), - reason="Docker not running", -) - - -if_is_linux = pytest.mark.skipif( - platform.system() != "Linux", - reason="Interactive ShellSession (pty) requires Linux.", -) - - def clean_up_pytest_path(obs): """clean up the pytest path to not depend on the env""" return re.sub( @@ -176,8 +154,8 @@ def test_pdb_use_default_env_entrypoint(tmp_path, setup_test_repo): assert "(Pdb)" not in output -@if_is_linux -@if_docker_running +@pytest.if_is_linux +@pytest.if_docker_running def test_pdb_use_docker_terminal(tmp_path, setup_test_repo): """Test PDBTool similar to test_pdb_use but using DockerTerminal""" tests_path = str(setup_test_repo(tmp_path)) From 5a8f2795abbe641002707926f380ee83c75ef52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Wed, 10 Dec 2025 11:07:23 -0800 Subject: [PATCH 08/12] More tests --- debug_gym/gym/terminals/kubernetes.py | 2 +- tests/gym/envs/test_free_env.py | 400 ++++++++++++++++++++++++++ 2 files changed, 401 insertions(+), 1 deletion(-) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index f89e0fb9..96715a35 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ 
b/debug_gym/gym/terminals/kubernetes.py @@ -298,7 +298,7 @@ def __init__( self.registry = registry.rstrip("/") + "/" if registry else "" self._pod_name = pod_name self.pod_spec_kwargs = pod_spec_kwargs or {} - user = _clean_for_kubernetes(os.environ.get("USER", "unknown")) + user = _clean_for_kubernetes(os.environ.get("USER", "unknown").split("@")[0]) self.labels = {"app": "dbg-gym", "user": user} | (extra_labels or {}) self._pod = None diff --git a/tests/gym/envs/test_free_env.py b/tests/gym/envs/test_free_env.py index e69de29b..f09cc880 100644 --- a/tests/gym/envs/test_free_env.py +++ b/tests/gym/envs/test_free_env.py @@ -0,0 +1,400 @@ +"""Unit tests for FreeEnv without mocking.""" + +import shutil +import tempfile +from pathlib import Path + +import pytest + +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.terminals.docker import DockerTerminal +from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.tools.toolbox import Toolbox +from debug_gym.logger import DebugGymLogger + + +@pytest.fixture +def test_repo(tmp_path): + """Create a simple test repository structure.""" + repo_path = tmp_path / "test_repo" + repo_path.mkdir() + + # Create some test files + (repo_path / "main.py").write_text( + "def greet(name):\n return f'Hello, {name}!'\n\nif __name__ == '__main__':\n print(greet('World'))\n" + ) + (repo_path / "test_main.py").write_text( + "from main import greet\n\ndef test_greet():\n assert greet('Alice') == 'Hello, Alice!'\n" + ) + (repo_path / "requirements.txt").write_text("pytest\n") + (repo_path / "README.md").write_text("# Test Project\n\nA simple test project.\n") + + # Create a subdirectory + subdir = repo_path / "utils" + subdir.mkdir() + (subdir / "__init__.py").write_text("") + (subdir / "helper.py").write_text("def add(a, b):\n return a + b\n") + + return repo_path + + +@pytest.fixture +def logger(): + """Create a test logger.""" + return DebugGymLogger("test_free_env") + + +class TestFreeEnvInitialization: 
+ """Test FreeEnv initialization with different configurations.""" + + def test_init_default_terminal(self, test_repo, logger): + """Test that default terminal is DockerTerminal.""" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + } + + env = FreeEnv(task_data=task_data, logger=logger) + + assert isinstance(env.terminal, DockerTerminal) + assert env.task_data == task_data + assert env.logger == logger + + def test_init_with_local_terminal_raises_error(self, test_repo, logger): + """Test that LocalTerminal raises ValueError.""" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + } + terminal = LocalTerminal() + + with pytest.raises( + ValueError, match="only supports DockerTerminal and KubernetesTerminal" + ): + FreeEnv(task_data=task_data, terminal=terminal, logger=logger) + + def test_init_default_setup_commands(self, test_repo, logger): + """Test that default setup_commands are set when not provided.""" + # When task_data is None, kwargs are used to create task_data + env = FreeEnv( + image="python:3.11", + local_path=str(test_repo), + logger=logger, + ) + + assert "setup_commands" in env.task_data + assert "apt-get update" in env.task_data["setup_commands"][0] + assert "git tree" in env.task_data["setup_commands"][0] + + +class TestFreeEnvProperties: + """Test FreeEnv properties.""" + + def test_task_name_property(self, test_repo, logger): + """Test task_name property returns correct format.""" + task_data = { + "image": "python:3.11-slim", + "local_path": str(test_repo), + } + terminal = LocalTerminal() + + # Bypass terminal type check for testing + env = FreeEnv.__new__(FreeEnv) + env.task_data = task_data + env.terminal = terminal + env.logger = logger + + assert env.task_name == "FreeEnv(python:3.11-slim)" + + def test_instructions_property(self, test_repo, logger): + """Test instructions property returns guidance text.""" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + } + 
terminal = LocalTerminal() + + env = FreeEnv.__new__(FreeEnv) + env.task_data = task_data + env.terminal = terminal + env.logger = logger + + instructions = env.instructions + assert "isolated Linux environment" in instructions + assert "available tools" in instructions + + +class TestFreeEnvSetupTask: + """Test FreeEnv task setup.""" + + def test_setup_task(self, test_repo, logger): + """Test setup_task configures terminal correctly.""" + task_data = { + "image": "python:3.11-alpine", + "local_path": str(test_repo), + } + terminal = LocalTerminal() + + env = FreeEnv.__new__(FreeEnv) + env.task_data = task_data + env.terminal = terminal + env.logger = logger + + env.setup_task() + + assert env.terminal.task_name == "FreeEnv(python:3.11-alpine)" + assert env.terminal.base_image == "python:3.11-alpine" + + +class TestFreeEnvSetupWorkspace: + """Test FreeEnv workspace setup.""" + + def test_setup_workspace_method_exists(self, test_repo, logger): + """Test that setup_workspace method exists.""" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + "workspace_dir": "/workspace", + } + + env = FreeEnv(task_data=task_data, logger=logger) + + # Verify method exists and is callable + assert hasattr(env, "setup_workspace") + assert callable(env.setup_workspace) + + def test_setup_workspace_sets_working_dir(self, test_repo, logger): + """Test that setup_workspace uses the workspace_dir from task_data.""" + workspace_dir = "/custom/testbed" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + "workspace_dir": workspace_dir, + } + + env = FreeEnv(task_data=task_data, logger=logger) + + # The workspace_dir should be in task_data + assert env.task_data.get("workspace_dir") == workspace_dir + + +class TestFreeEnvSetupTerminal: + """Test FreeEnv terminal setup.""" + + def test_setup_terminal_with_git_available(self, test_repo, logger): + """Test terminal setup when git is available.""" + task_data = { + "image": "python:3.11", + 
"local_path": str(test_repo), + "setup_commands": ["echo 'setup complete'"], + } + terminal = LocalTerminal() + + env = FreeEnv.__new__(FreeEnv) + env.task_data = task_data + env.terminal = terminal + env.logger = logger + + # Test _git_available method + result = env._git_available() + # On most systems, git should be available + assert isinstance(result, bool) + + def test_git_available_with_none_terminal(self, logger): + """Test _git_available returns False when terminal is None.""" + task_data = {"image": "python:3.11"} + + env = FreeEnv.__new__(FreeEnv) + env.task_data = task_data + env.terminal = None + env.logger = logger + + assert env._git_available() is False + + +class TestFreeEnvIntegration: + """Integration tests for FreeEnv.""" + + def test_add_tools(self, test_repo, logger): + """Test adding tools to FreeEnv.""" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + } + + env = FreeEnv(task_data=task_data, logger=logger) + + # Add a tool + bash_tool = Toolbox.get_tool("bash") + env.add_tool(bash_tool) + + assert len(env.tools) == 1 + assert env.tools[0].name == "bash" + + # Add another tool + view_tool = Toolbox.get_tool("view") + env.add_tool(view_tool) + + assert len(env.tools) == 2 + tool_names = [tool.name for tool in env.tools] + assert "bash" in tool_names + assert "view" in tool_names + + def test_multiple_workspace_dirs(self, test_repo, logger): + """Test different workspace_dir values.""" + workspace_dirs = ["/testbed", "/workspace", "/app", "/code"] + + for workspace_dir in workspace_dirs: + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + "workspace_dir": workspace_dir, + } + + env = FreeEnv(task_data=task_data, logger=logger) + + assert env.task_data["workspace_dir"] == workspace_dir + + def test_various_images(self, test_repo, logger): + """Test FreeEnv with various Docker images.""" + images = [ + "python:3.11", + "python:3.11-slim", + "python:3.11-alpine", + "python:3.10", + "ubuntu:22.04", 
+ ] + + for image in images: + task_data = { + "image": image, + "local_path": str(test_repo), + } + + env = FreeEnv(task_data=task_data, logger=logger) + + assert env.task_name == f"FreeEnv({image})" + + +class TestFreeEnvEdgeCases: + """Test edge cases and error conditions.""" + + def test_minimal_task_data(self, logger): + """Test with minimal task_data.""" + task_data = {"image": "python:3.11"} + + env = FreeEnv(task_data=task_data, logger=logger) + + # Should have the image + assert env.task_data["image"] == "python:3.11" + + def test_kwargs_used_when_no_task_data(self, test_repo, logger): + """Test that kwargs are used to construct task_data when task_data is None.""" + env = FreeEnv( + image="python:3.11", + local_path=str(test_repo), + workspace_dir="/custom", + setup_commands=["echo test"], + logger=logger, + ) + + # Kwargs should be used to create task_data + assert env.task_data["image"] == "python:3.11" + assert env.task_data["local_path"] == str(test_repo) + assert env.task_data["workspace_dir"] == "/custom" + assert env.task_data["setup_commands"] == ["echo test"] + + def test_task_data_priority_over_kwargs(self, test_repo, logger): + """Test that task_data takes priority over kwargs.""" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + "workspace_dir": "/from_task_data", + } + + env = FreeEnv( + task_data=task_data, + image="python:3.10", # Should be ignored + workspace_dir="/from_kwargs", # Should be ignored + logger=logger, + ) + + # task_data values should be used + assert env.task_data["image"] == "python:3.11" + assert env.task_data["workspace_dir"] == "/from_task_data" + + def test_nonexistent_local_path(self, logger): + """Test with a non-existent local_path.""" + task_data = { + "image": "python:3.11", + "local_path": "/nonexistent/path/that/does/not/exist", + "workspace_dir": "/testbed", + } + + env = FreeEnv(task_data=task_data, logger=logger) + + # Should not crash during initialization + assert 
env.task_data["local_path"] == "/nonexistent/path/that/does/not/exist" + # The error would occur during setup_workspace when trying to copy + + def test_custom_setup_commands(self, test_repo, logger): + """Test with custom setup commands.""" + custom_commands = [ + "apt-get update", + "apt-get install -y vim", + "pip install numpy pandas", + ] + + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + "setup_commands": custom_commands, + } + + env = FreeEnv(task_data=task_data, logger=logger) + + assert env.task_data["setup_commands"] == custom_commands + + def test_empty_setup_commands(self, test_repo, logger): + """Test with empty setup commands.""" + task_data = { + "image": "python:3.11", + "local_path": str(test_repo), + "setup_commands": [], + } + + env = FreeEnv(task_data=task_data, logger=logger) + + assert env.task_data["setup_commands"] == [] + + +@pytest.if_docker_running +class TestFreeEnvWithDocker: + """Tests that require Docker to be running.""" + + def test_init_with_docker_terminal(self, test_repo, logger): + """Test initialization with DockerTerminal.""" + task_data = { + "image": "python:3.11-slim", + "local_path": str(test_repo), + } + + env = FreeEnv(task_data=task_data, logger=logger) + + assert isinstance(env.terminal, DockerTerminal) + assert env.terminal.base_image is None # Not set until setup_task + + def test_full_initialization_with_docker(self, test_repo, logger): + """Test full FreeEnv initialization with Docker.""" + task_data = { + "image": "python:3.11-slim", + "local_path": str(test_repo), + "workspace_dir": "/testbed", + "setup_commands": ["apt-get update && apt-get install -y git"], + } + + env = FreeEnv(task_data=task_data, logger=logger) + + assert env.task_data == task_data + assert isinstance(env.terminal, DockerTerminal) + assert env.task_name == "FreeEnv(python:3.11-slim)" From 7374576afa3f50d9c844f4496f6e6ea51294fd19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Wed, 
10 Dec 2025 11:13:17 -0800 Subject: [PATCH 09/12] Fix --- debug_gym/llms/base.py | 8 ++------ tests/llms/test_base.py | 13 ++++--------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/debug_gym/llms/base.py b/debug_gym/llms/base.py index 2df00498..93f8a288 100644 --- a/debug_gym/llms/base.py +++ b/debug_gym/llms/base.py @@ -225,7 +225,7 @@ def instantiate( """Creates an instance of the appropriate LLM class based on the configuration. Args: - llm_name: Name of the LLM model to instantiate. + config: Dictionary containing LLM configuration, must include 'name' key. llm_config_file_path: Optional path to the LLM configuration file. logger: Optional DebugGymLogger for logging. @@ -235,16 +235,12 @@ def instantiate( logger = logger or DebugGymLogger("debug-gym") llm_name = config.get("name") - if not llm_name: - return None - - elif llm_name == "human": + if llm_name == "human": from debug_gym.llms import Human return Human(llm_name, logger=logger) llm_config = LLMConfigRegistry.from_file(llm_config_file_path)[llm_name] - llm_config.update(config) tags = llm_config.tags diff --git a/tests/llms/test_base.py b/tests/llms/test_base.py index 7a882cc2..f8ef05b3 100644 --- a/tests/llms/test_base.py +++ b/tests/llms/test_base.py @@ -66,9 +66,6 @@ ), ) def test_instantiate_llm(mock_open, logger_mock): - llm = LLM.instantiate(None, logger=logger_mock) - assert llm is None - # tags are used to filter models llm = LLM.instantiate({"name": "gpt-4o-mini"}, logger=logger_mock) assert isinstance(llm, OpenAILLM) @@ -359,7 +356,7 @@ def test_token_usage_initialization(): } ), ) -def test_llm_call_with_generate_kwargs(mock_llm_config, logger_mock, llm_class_mock): +def test_llm_call_with_generate_kwargs(logger_mock, llm_class_mock): messages = [{"role": "user", "content": "Hello"}] llm_mock = llm_class_mock("llm-mock", logger=logger_mock) llm_response = llm_mock(messages, tools) @@ -387,9 +384,7 @@ def test_llm_call_with_generate_kwargs(mock_llm_config, logger_mock, 
llm_class_m } ), ) -def test_llm_call_override_generate_kwargs( - mock_llm_config, logger_mock, llm_class_mock -): +def test_llm_call_override_generate_kwargs(logger_mock, llm_class_mock): messages = [{"role": "user", "content": "Hello"}] llm_mock = llm_class_mock("llm-mock", logger=logger_mock) # Override the temperature from config @@ -413,7 +408,7 @@ def test_llm_call_override_generate_kwargs( } ), ) -def test_llm_call_ignore_kwargs(mock_llm_config, logger_mock, llm_class_mock): +def test_llm_call_ignore_kwargs(logger_mock, llm_class_mock): messages = [{"role": "user", "content": "Hello"}] llm_mock = llm_class_mock("llm-mock", logger=logger_mock) llm_mock(messages, tools, temperature=0.7, max_tokens=10) @@ -509,7 +504,7 @@ def test_llm_init_with_both_config_types(logger_mock, llm_class_mock): ), ) def test_context_length_exceeded_prevents_infinite_recursion( - mock_llm_config, logger_mock, llm_class_mock + llm_mock, logger_mock, llm_class_mock ): """Test that ContextLengthExceededError handling prevents infinite recursion.""" From 4612498eb204c0d2409b5bb684b47f04f359c26d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Alexandre=20C=C3=B4t=C3=A9?= Date: Wed, 10 Dec 2025 11:47:13 -0800 Subject: [PATCH 10/12] Adding tests for select_env and load_dataset --- tests/gym/envs/test_init.py | 210 ++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 tests/gym/envs/test_init.py diff --git a/tests/gym/envs/test_init.py b/tests/gym/envs/test_init.py new file mode 100644 index 00000000..37408215 --- /dev/null +++ b/tests/gym/envs/test_init.py @@ -0,0 +1,210 @@ +"""Unit tests for select_env and load_dataset functions without mocking.""" + +import pytest + +from debug_gym.gym.envs import load_dataset, select_env +from debug_gym.gym.envs.aider import AiderBenchmarkEnv +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.envs.local import LocalEnv +from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv +from 
debug_gym.gym.envs.r2egym import R2EGymEnv +from debug_gym.gym.envs.swe_bench import SWEBenchEnv +from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv +from debug_gym.gym.envs.swe_smith import SWESmithEnv +from debug_gym.logger import DebugGymLogger + + +class TestSelectEnv: + """Test the select_env function with all environment types.""" + + def test_select_local_env(self): + """Test selecting LocalEnv.""" + env_class = select_env("local") + assert env_class == LocalEnv + + def test_select_aider_env(self): + """Test selecting AiderBenchmarkEnv.""" + env_class = select_env("aider") + assert env_class == AiderBenchmarkEnv + + def test_select_swebench_env(self): + """Test selecting SWEBenchEnv.""" + env_class = select_env("swebench") + assert env_class == SWEBenchEnv + + def test_select_swebench_debug_env(self): + """Test selecting SWEBenchDebugEnv.""" + env_class = select_env("swebench-debug") + assert env_class == SWEBenchDebugEnv + + def test_select_swesmith_env(self): + """Test selecting SWESmithEnv.""" + env_class = select_env("swesmith") + assert env_class == SWESmithEnv + + def test_select_mini_nightmare_env(self): + """Test selecting MiniNightmareEnv.""" + env_class = select_env("mini_nightmare") + assert env_class == MiniNightmareEnv + + def test_select_r2egym_env(self): + """Test selecting R2EGymEnv.""" + env_class = select_env("r2egym") + assert env_class == R2EGymEnv + + def test_select_free_env(self): + """Test selecting FreeEnv.""" + env_class = select_env("FreeEnv") + assert env_class == FreeEnv + + def test_select_unknown_env(self): + """Test that selecting unknown env raises ValueError.""" + with pytest.raises(ValueError, match="Unknown environment unknown_env"): + select_env("unknown_env") + + def test_select_none_env(self): + """Test that selecting None env raises ValueError.""" + with pytest.raises(ValueError, match="Unknown environment None"): + select_env(None) + + def test_select_empty_string_env(self): + """Test that selecting empty 
string env raises ValueError.""" + with pytest.raises(ValueError, match="Unknown environment"): + select_env("") + + def test_select_env_case_sensitive(self): + """Test that env selection is case-sensitive.""" + # "local" works + assert select_env("local") == LocalEnv + + # "Local" should fail + with pytest.raises(ValueError, match="Unknown environment Local"): + select_env("Local") + + def test_all_env_types_are_classes(self): + """Test that all returned env types are actually classes.""" + env_types = [ + "local", + "aider", + "swebench", + "swebench-debug", + "swesmith", + "mini_nightmare", + "r2egym", + "FreeEnv", + ] + + for env_type in env_types: + env_class = select_env(env_type) + assert isinstance(env_class, type), f"{env_type} should return a class" + + +class TestLoadDataset: + """Test the load_dataset function.""" + + def test_load_dataset_missing_type(self): + """Test that load_dataset raises ValueError when 'type' is missing.""" + config = {"some_key": "some_value"} + + with pytest.raises( + ValueError, match="Dataset config must specify 'type' field" + ): + load_dataset(config) + + def test_load_dataset_type_none(self): + """Test that load_dataset raises ValueError when 'type' is None.""" + config = {"type": None} + + with pytest.raises( + ValueError, match="Dataset config must specify 'type' field" + ): + load_dataset(config) + + def test_load_dataset_unknown_type(self): + """Test that load_dataset raises ValueError for unknown environment type.""" + config = {"type": "nonexistent_env"} + + with pytest.raises( + ValueError, + match="Unknown environment type 'nonexistent_env' from dataset's config", + ): + load_dataset(config) + + def test_load_dataset_with_logger(self): + """Test that load_dataset accepts a logger parameter.""" + logger = DebugGymLogger("test_logger") + config = {"type": "mini_nightmare", "build_image": False} + + # This should not raise an error even if dataset loading fails + # We're just testing the function signature and error 
handling + try: + dataset = load_dataset(config, logger=logger) + # If it succeeds, check that it returns a dict + assert isinstance(dataset, dict) + except Exception: + # If it fails for other reasons (e.g., dataset not available), + # that's okay - we're testing the logger parameter acceptance + pass + + def test_load_dataset_without_logger(self): + """Test that load_dataset works without logger parameter.""" + config = {"type": "mini_nightmare", "build_image": False} + + # This should not raise an error about missing logger + try: + dataset = load_dataset(config) + # If it succeeds, check that it returns a dict + assert isinstance(dataset, dict) + except Exception: + # If it fails for other reasons (e.g., dataset not available), + # that's okay - we're testing that logger is optional + pass + + def test_load_dataset_passes_config_to_env(self): + """Test that load_dataset passes configuration to the environment's load_dataset method.""" + # Using mini_nightmare as it has a simple load_dataset signature + config = { + "type": "mini_nightmare", + "build_image": False, + "problems": None, # This should be passed to MiniNightmareEnv.load_dataset + } + + try: + dataset = load_dataset(config) + # If successful, verify it's a dict (expected return type) + assert isinstance(dataset, dict) + except Exception: + # If it fails for other reasons, that's acceptable + pass + + def test_load_dataset_error_message_includes_config(self): + """Test that error message includes the config when type is invalid.""" + config = {"type": "invalid_type", "other_param": "value"} + + with pytest.raises(ValueError) as exc_info: + load_dataset(config) + + # Check that the error message includes the config + error_message = str(exc_info.value) + assert "invalid_type" in error_message + assert "config" in error_message.lower() + + def test_load_dataset_preserves_value_error_from_select_env(self): + """Test that ValueError from select_env is properly wrapped.""" + config = {"type": "bad_env"} + 
+ with pytest.raises(ValueError) as exc_info: + load_dataset(config) + + # Check that it's chained from the original ValueError + error_message = str(exc_info.value) + assert "Unknown environment type 'bad_env'" in error_message + + def test_load_dataset_empty_config(self): + """Test load_dataset with empty config dict.""" + config = {} + + with pytest.raises( + ValueError, match="Dataset config must specify 'type' field" + ): + load_dataset(config) From 43978507f553853f99f917b00c83ff7f7620c507 Mon Sep 17 00:00:00 2001 From: "Xingdi (Eric) Yuan" Date: Wed, 10 Dec 2025 20:37:10 -0500 Subject: [PATCH 11/12] update configs, removing swe_agent.py --- configs/aider.yaml | 62 +++++++++++--------- configs/config.yaml | 65 ++++++++++++--------- configs/free_env.yaml | 2 +- configs/mini_nightmare.yaml | 63 +++++++++++--------- configs/r2egym.yaml | 92 +++++++++++++---------------- configs/swebench.yaml | 100 +++++++++++++------------------- configs/swebench_debug.yaml | 94 +++++++++++++----------------- configs/swebench_swe_agent.yaml | 48 +++++++++++++++ configs/swesmith.yaml | 91 +++++++++++++---------------- debug_gym/agents/__init__.py | 2 - debug_gym/agents/swe_agent.py | 7 --- 11 files changed, 312 insertions(+), 314 deletions(-) create mode 100644 configs/swebench_swe_agent.yaml delete mode 100644 debug_gym/agents/swe_agent.py diff --git a/configs/aider.yaml b/configs/aider.yaml index fda35622..a1872d16 100644 --- a/configs/aider.yaml +++ b/configs/aider.yaml @@ -1,34 +1,38 @@ -base: - # Environment configs - output_path: "exps/aider" +# Configuration for Aider benchmark runs. +task_name: aider +output_path: exps/aider - env: - run_timeout: 20 +llm_name: gpt-4o - terminal: - type: "docker" # "docker", "kubernetes", or "local" +# Tools to load into the environment toolbox. 
+tools: + - bash + - view + - edit + - eval - dataset: - type: "aider" - problems: "all" # list of problems, e.g., ["wordy"], or "all" +task_data: + dataset_type: aider + problems: all # list of problems, e.g., ["wordy"], or "all" + run_timeout: 20 - # LLM configs - llm_name: "gpt-4o" +terminal: + type: docker - # Agent configs - random_seed: 42 - max_steps: 50 - save_patch: True - # Optionally loads a custom system prompt template from a file. - # system_prompt: "script/templates/system_prompt.jinja" - - # Shortcut features - "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. - -froggy_agent: - tools: - - grep - - pdb - - view - - edit - - eval +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of hidden test functions. + You have access to a problem statement that describes the issue, + you also have access to a set of tools to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must investigate the code carefully to understand the potential bugs. + Once you have gained enough information, propose a patch to fix the bugs. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. + Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. + You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + Once you are done, use the submit tool to end the debugging session, note you can only call it once. 
+ Output both your thinking process (if any) and the tool call (must) in the response. diff --git a/configs/config.yaml b/configs/config.yaml index bebe45d2..babc6198 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,32 +1,41 @@ -base: - # Environment configs - output_path: "exps/pytorch" - env: - type: "local" - path: "data/pytorch" - entrypoint: "python -m pytest -sv test.py" - debug_entrypoint: "python -m pdb -m pytest -s test.py" - run_timeout: 10 - terminal: - type: "docker" # "local", "docker", or "kubernetes" +# Configuration for local PyTorch runs. +task_name: pytorch +output_path: exps/pytorch - # LLM configs - llm_name: "gpt-4o" +llm_name: gpt-4o - # Agent configs - random_seed: 42 - max_steps: 50 - save_patch: True - # Optionally loads a custom system prompt template from a file. - # system_prompt: "script/templates/system_prompt.jinja" +# Tools to load into the environment toolbox. +tools: + - bash + - view + - pdb + - edit + - eval - # Shortcut features - "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. +task_data: + env_type: local + path: data/pytorch + entrypoint: python -m pytest -sv test.py + debug_entrypoint: python -m pdb -m pytest -s test.py + run_timeout: 10 -froggy_agent: - tools: - - grep - - pdb - - view - - edit - - eval +terminal: + type: docker + +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of hidden test functions. + You have access to a problem statement that describes the issue, + you also have access to a set of tools to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must investigate the code carefully to understand the potential bugs. 
+ Once you have gained enough information, propose a patch to fix the bugs. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. + Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. + You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + Once you are done, use the submit tool to end the debugging session, note you can only call it once. + Output both your thinking process (if any) and the tool call (must) in the response. diff --git a/configs/free_env.yaml b/configs/free_env.yaml index 1feb0bbb..f642796e 100644 --- a/configs/free_env.yaml +++ b/configs/free_env.yaml @@ -23,7 +23,7 @@ terminal: agent: type: froggy max_steps: 20 - system_prompt: | + system_prompt: >- You are assisting in an exploratory codebase understanding session inside an open-ended container. You have access to a set of tools to inspect and modify the codebase. Your goal is to use the tools to gather as much information about the codebase as possible. diff --git a/configs/mini_nightmare.yaml b/configs/mini_nightmare.yaml index 1fa373a9..36c638bd 100644 --- a/configs/mini_nightmare.yaml +++ b/configs/mini_nightmare.yaml @@ -1,34 +1,39 @@ -base: - # Environment configs - output_path: "exps/mini_nightmare" +# Configuration for Mini Nightmare benchmark runs. +task_name: mini_nightmare +output_path: exps/mini_nightmare - env: - run_timeout: 30 +llm_name: gpt-4o - terminal: - type: "docker" # "docker", "kubernetes", or "local" +# Tools to load into the environment toolbox. 
+tools: + - bash + - view + - pdb + - edit + - eval - dataset: - type: "mini_nightmare" - problems: "all" # list of problems, e.g., ["config"], or "all" +task_data: + dataset_type: mini_nightmare + problems: all # list of problems, e.g., ["config"], or "all" + run_timeout: 30 - # LLM configs - llm_name: "gpt-4o" +terminal: + type: docker - # Agent configs - random_seed: 42 - max_steps: 50 - save_patch: True - # Optionally loads a custom system prompt template from a file. - # system_prompt: "script/templates/system_prompt.jinja" - - # Shortcut features - "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. - -froggy_agent: - tools: - - grep - - pdb - - view - - edit - - eval +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of hidden test functions. + You have access to a problem statement that describes the issue, + you also have access to a set of tools to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must investigate the code carefully to understand the potential bugs. + Once you have gained enough information, propose a patch to fix the bugs. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. + Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. + You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + Once you are done, use the submit tool to end the debugging session, note you can only call it once. 
+ Output both your thinking process (if any) and the tool call (must) in the response. diff --git a/configs/r2egym.yaml b/configs/r2egym.yaml index 1f9658b5..9ac81fb7 100644 --- a/configs/r2egym.yaml +++ b/configs/r2egym.yaml @@ -1,52 +1,40 @@ -base: - # Environment configs - output_path: "exps/re2gym" - - env: - run_timeout: 300 - - dataset: - type: "r2egym" - problems: "all" # list of problems, e.g., ["aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"], or strings like "d1-valid" (defined in gym/envs/configs), or "all", - dataset_id: "R2E-Gym/R2E-Gym-Lite" - dataset_revision: "8d3163011f01f9393bb3dc7700497a79a8686ae5" - - terminal: - type: "docker" # "docker", "kubernetes" - - # LLM configs - llm_name: "gpt-4o" - - # Agent configs - random_seed: 42 - max_steps: 50 - save_patch: True - # Optionally loads a custom system prompt template from a file. - # system_prompt: "script/templates/system_prompt.jinja" - - # Shortcut features - "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. - -froggy_agent: - tools: - - grep - - view - - edit - - listdir - - eval - system_prompt: Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools, you can use them to investigate the code and propose an edit patch to fix the bugs. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action unless they can provide more information. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. Output both your thinking process (if any) and the tool call (must) in the response. 
- # tools: - # - grep - # - pdb - # - view - # - edit - # - listdir - # - eval - # system_prompt: You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose an edit patch to fix the bugs. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the edit tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every edit, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. 
- -solution_agent: - llm_name: "human" # No need for an LLM. - tools: - - eval - # - pdb - - submit +# Configuration for R2E-Gym benchmark runs. +task_name: r2egym +output_path: exps/r2egym + +llm_name: gpt-4o + +# Tools to load into the environment toolbox. +tools: + - bash + - view + - edit + - submit + +task_data: + dataset_type: r2egym + problems: all # list of problems, e.g., ["aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"], or strings like "d1-valid", or "all" + dataset_id: R2E-Gym/R2E-Gym-Lite + dataset_revision: 8d3163011f01f9393bb3dc7700497a79a8686ae5 + run_timeout: 300 + +terminal: + type: docker + +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of hidden test functions. + You have access to a problem statement that describes the issue, + you also have access to a set of tools to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must investigate the code carefully to understand the potential bugs. + Once you have gained enough information, propose a patch to fix the bugs. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. + Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. + You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + Once you are done, use the submit tool to end the debugging session, note you can only call it once. + Output both your thinking process (if any) and the tool call (must) in the response. 
diff --git a/configs/swebench.yaml b/configs/swebench.yaml index f0e14a4a..3609c9b2 100644 --- a/configs/swebench.yaml +++ b/configs/swebench.yaml @@ -1,60 +1,40 @@ -base: - # Environment configs - output_path: "exps/swebench-verified" - - env: - run_timeout: 300 - - dataset: - type: "swebench" - problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or "all" - dataset_id: "SWE-bench/SWE-bench_Verified" - dataset_revision: "99450355ca8c611021187a57ffac304b66666738" - - terminal: - type: "docker" # "docker" or "kubernetes" - - # LLM configs - llm_name: "gpt-4o" - - # Agent configs - random_seed: 42 - max_steps: 50 - save_patch: True - # Optionally loads a custom system prompt template from a file. - # system_prompt: "script/templates/system_prompt.jinja" - - # Shortcut features - "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. - - -froggy_agent: - tools: - - grep - - view - - edit - - listdir - - eval - system_prompt: Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools, you can use them to investigate the code and propose an edit patch to fix the bugs. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action unless they can provide more information. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. Output both your thinking process (if any) and the tool call (must) in the response. - # tools: - # - grep - # - pdb - # - view - # - edit - # - listdir - # - eval - # system_prompt: You are a debugging agent specialized in fixing Python programs. 
Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose an edit patch to fix the bugs. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the edit tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every edit, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. - -solution_agent: - llm_name: null # No need for an LLM. 
- tools: - - eval - - pdb - - submit - -swe_agent: - max_steps: 100 - tools: - - bash - - edit - - submit +# Configuration for SWE-bench Verified benchmark runs. +task_name: swebench +output_path: exps/swebench-verified + +llm_name: gpt-4o + +# Tools to load into the environment toolbox. +tools: + - bash + - view + - edit + - submit + +task_data: + dataset_type: swebench + problems: all # list of problems, e.g., ["astropy__astropy-12907"], or "all" + dataset_id: SWE-bench/SWE-bench_Verified + dataset_revision: 99450355ca8c611021187a57ffac304b66666738 + run_timeout: 300 + +terminal: + type: docker + +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of hidden test functions. + You have access to a problem statement that describes the issue, + you also have access to a set of tools to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must investigate the code carefully to understand the potential bugs. + Once you have gained enough information, propose a patch to fix the bugs. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. + Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. + You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + Once you are done, use the submit tool to end the debugging session, note you can only call it once. + Output both your thinking process (if any) and the tool call (must) in the response. 
diff --git a/configs/swebench_debug.yaml b/configs/swebench_debug.yaml index b4105bd2..a382ad3d 100644 --- a/configs/swebench_debug.yaml +++ b/configs/swebench_debug.yaml @@ -1,54 +1,40 @@ -base: - # Environment configs - output_path: "exps/swebench-verified-debugmode" - - env: - run_timeout: 300 - - dataset: - type: "swebench-debug" - problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or "all" - dataset_id: "SWE-bench/SWE-bench_Verified" - dataset_revision: "99450355ca8c611021187a57ffac304b66666738" - - terminal: - type: "docker" # "docker" or "kubernetes" - - # LLM configs - llm_name: "gpt-4o" - - # Agent configs - random_seed: 42 - max_steps: 50 - save_patch: True - # Optionally loads a custom system prompt template from a file. - # system_prompt: "script/templates/system_prompt.jinja" - - # Shortcut features - "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. - - -froggy_agent: - tools: - - grep - - view - - edit - - listdir - - eval - system_prompt: Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools, you can use them to investigate the code and propose an edit patch to fix the bugs. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action unless they can provide more information. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. Output both your thinking process (if any) and the tool call (must) in the response. 
- # tools: - # - grep - # - pdb - # - view - # - edit - # - listdir - # - eval - # system_prompt: You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose an edit patch to fix the bugs. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the edit tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every edit, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. 
- - -solution_agent: - llm_name: null # No need for an LLM. - tools: - - eval - - pdb - - submit +# Configuration for SWE-bench Verified benchmark runs (debug mode). +task_name: swebench-debug +output_path: exps/swebench-verified-debugmode + +llm_name: gpt-4o + +# Tools to load into the environment toolbox. +tools: + - bash + - view + - edit + - eval + +task_data: + dataset_type: swebench-debug + problems: all # list of problems, e.g., ["astropy__astropy-12907"], or "all" + dataset_id: SWE-bench/SWE-bench_Verified + dataset_revision: 99450355ca8c611021187a57ffac304b66666738 + run_timeout: 300 + +terminal: + type: docker + +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of hidden test functions. + You have access to a problem statement that describes the issue, + you also have access to a set of tools to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must investigate the code carefully to understand the potential bugs. + Once you have gained enough information, propose a patch to fix the bugs. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. + Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. + You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + Once you are done, use the submit tool to end the debugging session, note you can only call it once. + Output both your thinking process (if any) and the tool call (must) in the response. 
diff --git a/configs/swebench_swe_agent.yaml b/configs/swebench_swe_agent.yaml new file mode 100644 index 00000000..35336b82 --- /dev/null +++ b/configs/swebench_swe_agent.yaml @@ -0,0 +1,48 @@ +# Configuration for SWE-bench using SWE-agent style (bash + edit + submit). +# This config reproduces the behavior of the removed swe_agent.py. +task_name: swebench-swe-agent +output_path: exps/swebench-swe-agent + +llm_name: gpt-4o + +# SWE-agent style tools: bash for exploration, edit for changes, submit to finish. +tools: + - bash + - edit + - submit + +task_data: + dataset_type: swebench + problems: all + dataset_id: SWE-bench/SWE-bench_Verified + dataset_revision: 99450355ca8c611021187a57ffac304b66666738 + run_timeout: 300 + +terminal: + type: docker + +agent: + type: froggy + max_steps: 100 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of test functions. + You have access to a set of tools including the ability to run bash commands to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must use the tools available to investigate the code and understand the potential bugs. + A common debugging workflow is to + 1) Analyze the codebase by finding and reading relevant files; + 2) Create a script to reproduce the issue; + 3) Edit the source code to resolve the issue; + 4) Verify your fix works by running your script again; + 5) Test edge cases to ensure your fix is robust; + 6) Submit your changes and finish your work by using the submit tool. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. + Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. 
+ You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. + If you are confident that you have enough information, propose a patch to fix the bugs by calling the edit tool. + If you are not sure, continue using the available tools to gather more information before proposing a patch. + After every edit, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved. + Output both your thinking process (if any) and the tool call (must) in the response. diff --git a/configs/swesmith.yaml b/configs/swesmith.yaml index 751eb8f1..305ae620 100644 --- a/configs/swesmith.yaml +++ b/configs/swesmith.yaml @@ -1,52 +1,39 @@ -base: - # Environment configs - output_path: "exps/swesmith" - - env: - # type: "swesmith" # Not needed Will be inferred from dataset. - run_timeout: 300 - - dataset: - type: "swesmith" - dataset_id: "SWE-bench/SWE-smith" - problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all", - - terminal: - type: "docker" - - # LLM configs - llm_name: "gpt-4o" - - # Agent configs - random_seed: 42 - max_steps: 50 - save_patch: True - # Optionally loads a custom system prompt template from a file. - # system_prompt: "script/templates/system_prompt.jinja" - - # Shortcut features - "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. - -froggy_agent: - tools: - - grep - - view - - edit - - listdir - - eval - system_prompt: Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools, you can use them to investigate the code and propose an edit patch to fix the bugs. 
Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action unless they can provide more information. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. Output both your thinking process (if any) and the tool call (must) in the response. - # tools: - # - grep - # - pdb - # - view - # - edit - # - listdir - # - eval - # system_prompt: You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the pdb debugger to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the pdb debugger to investigate the code and understand the potential bugs. A common debugging workflow is to 1) find suspicious files and lines (from error messages or test failures); 2) set breakpoints at suspicious places; 3) continue execution so the frame is at the breakpoint you set; 4) then print necessary values to identify the bugs. Once you have gained enough information, propose an edit patch to fix the bugs. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. 
If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the edit tool. If you are not sure, continue using the pdb tool to gather more information before proposing a patch. After every edit, it's always a good idea to call the eval tool to execute the new code and check if it passes the tests; if it does not, the tool will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response. - -solution_agent: - llm_name: null # No need for an LLM. - tools: - - eval - # - pdb - - submit +# Configuration for SWE-smith benchmark runs. +task_name: swesmith +output_path: exps/swesmith + +llm_name: gpt-4o + +# Tools to load into the environment toolbox. +tools: + - bash + - view + - edit + - submit + +task_data: + dataset_type: swesmith + problems: all # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125", or "all" + dataset_id: SWE-bench/SWE-smith + run_timeout: 300 + +terminal: + type: docker + +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a debugging agent specialized in fixing Python programs. + Your goal is to debug a Python program to make sure it can pass a set of hidden test functions. + You have access to a problem statement that describes the issue, + you also have access to a set of tools to help you investigate the code before proposing a patch. + While the code may seem familiar to you from your training, you should not assume you know the code. + Instead, you must investigate the code carefully to understand the potential bugs. + Once you have gained enough information, propose a patch to fix the bugs. + Avoid editing the entire code, focus on the bugs only. + You must make tool calls to interact with the environment, but you can only call one tool at a time. 
+ Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. + You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. + Once you are done, use the submit tool to end the debugging session, note you can only call it once. + Output both your thinking process (if any) and the tool call (must) in the response. diff --git a/debug_gym/agents/__init__.py b/debug_gym/agents/__init__.py index 3c51e0a4..caf48ec6 100644 --- a/debug_gym/agents/__init__.py +++ b/debug_gym/agents/__init__.py @@ -1,12 +1,10 @@ from debug_gym.agents.base_agent import BaseAgent, register_agent from debug_gym.agents.froggy_agent import FroggyAgent from debug_gym.agents.solution_agent import AgentSolution -from debug_gym.agents.swe_agent import SWEAgent __all__ = [ "BaseAgent", "register_agent", "FroggyAgent", "AgentSolution", - "SWEAgent", ] diff --git a/debug_gym/agents/swe_agent.py b/debug_gym/agents/swe_agent.py deleted file mode 100644 index 3bfc4d60..00000000 --- a/debug_gym/agents/swe_agent.py +++ /dev/null @@ -1,7 +0,0 @@ -from debug_gym.agents.base_agent import BaseAgent, register_agent - - -@register_agent -class SWEAgent(BaseAgent): - name = "swe_agent" - system_prompt = "You are a debugging agent specialized in fixing Python programs. Your goal is to debug a Python program to make sure it can pass a set of test functions. You have access to a set of tools including the ability to run bash commands to help you investigate the code before proposing a patch. While the code may seem familiar to you from your training, you should not assume you know the code. Instead, you must use the tools available to investigate the code and understand the potential bugs. 
A common debugging workflow is to 1) Analyze the codebase by finding and reading relevant files; 2) Create a script to reproduce the issue; 3) Edit the source code to resolve the issue; 4) Verify your fix works by running your script again; 5) Test edge cases to ensure your fix is robust; 6) Submit your changes and finish your work by using the submit tool. Avoid editing the entire code, focus on the bugs only. You must make tool calls to interact with the environment, but you can only call one tool at a time. Do not repeat your previous action, especially if it returned tool calling errors or it resulted in information that you already know. You can spend some time thinking to help you make the decision when you are stuck, but you must be concise and avoid overthinking. If you already had a plan in the previous steps, you can just follow it without repeating the thinking process. If you are confident that you have enough information, propose a patch to fix the bugs by calling the edit tool. If you are not sure, continue using the available tools to gather more information before proposing a patch. After every edit, it's always a good idea to run your reproduction script to execute the new code and check if the problem is resolved; if it is not, the script will return the error messages, which you can use to continue debugging. Output both your thinking process (if any) and the tool call (must) in the response." From 8143dfade6bfe45fb8999bce712a0f4374551606 Mon Sep 17 00:00:00 2001 From: "Xingdi (Eric) Yuan" Date: Wed, 10 Dec 2025 20:52:20 -0500 Subject: [PATCH 12/12] update readme --- README.md | 95 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 12c4cda0..607bf557 100644 --- a/README.md +++ b/README.md @@ -80,12 +80,14 @@ One of the core designs of `debug-gym` is the notion of tools. 
Users can dynamic | Tool name | Description | | :-: | :----- | +| `bash` | Run commands in a bash shell. You have access to common Linux and Python packages via pip. State is persistent across command calls within the same session. | | `listdir` | It returns the directory tree at a given subdirectory. This is particularly useful when dealing with a repository with multiple files. | | `view` | It is used to change an agent's focus to a particular source code file. This is particularly useful when dealing with a repository with multiple files. | | `eval` | It runs the current code repository using the provided entrypoint (e.g., pytest), and returns the terminal's output (e.g., error message). | -| `pdb` | Interactive debugger wrapping the [Python pdb tool](https://docs.python.org/3/library/pdb.html). In additon, users can choose to maintain a set of persistent breakpoints (as in some programming IDEs), which are not reset after every eval. With such feature, a new pdb debugging session is activated automatically, with all the breakpoints restored. Note such breakpoint can be cleared by pdb commands such as `cl`. | +| `pdb` | Interactive debugger wrapping the [Python pdb tool](https://docs.python.org/3/library/pdb.html). In addition, users can choose to maintain a set of persistent breakpoints (as in some programming IDEs), which are not reset after every eval. With such feature, a new pdb debugging session is activated automatically, with all the breakpoints restored. Note such breakpoints can be cleared by pdb commands such as `cl`. | | `grep` | Search for patterns in files within the repository. Supports both literal string matching and regular expressions. Can search in specific files, directories, or the entire repository. Useful for finding code patterns, function definitions, variable usage, or identifying files containing specific text. | | `edit` | It can be used to edit a certain piece of code to fix the bug. 
The inputs of this tool call include the file path, the start and end line numbers, and the new code. | +| `submit` | Submit your changes once the task is complete. By default, it runs evaluation before terminating the session, but this can be disabled via `eval_on_submit: false`. | Upon importing a tool, its action space and observation space will be automatically merged into `debug-gym`'s action space and observation space; its instruction will also be merged into the overall instruction provided to the agent (e.g., as system prompt). @@ -99,9 +101,8 @@ We provide the below LLM-based agents, they all have minimal design and serve th | Agent name | Available Tools | Description | | :-: | :-: | :----- | -| `froggy_agent` | `grep`, `pdb`, `view`, `edit`, `eval` (configurable) | Primary debugging agent. Adjust prompts and tool lists in YAML to mimic edit-only, grep-heavy, or other workflows. | -| `solution_agent` | `pdb`, `eval` | An oracle agent that applies a gold patch (only works with `swebench` and `swesmith` benchmarks for now). The agent checks that tests are failing before applying the patch, and passing after. It also checks that `pdb` tool can be used as expected. | -| `swe_agent` | `bash`, `edit`, `submit` | Baseline agent tailored for the SWE-bench setting that executes bash commands in addition to edits. | +| `froggy_agent` | `bash`, `view`, `edit`, `submit` (configurable) | Primary debugging agent. Adjust prompts and tool lists in YAML to customize workflows. | +| `solution_agent` | `pdb`, `eval` | An oracle agent that applies a gold patch (works with `swebench`, `swesmith`, and `r2egym` benchmarks). The agent checks that tests are failing before applying the patch, and passing after. It also checks that `pdb` tool can be used as expected (if available). 
| --- @@ -118,7 +119,7 @@ To demonstrate how to integrate `debug-gym` with coding tasks and repositories, | `mini_nightmare` | A set of 10 hand-crafted minimal buggy code snippet where edit-only agents have harder time to tackle. Read details [here](https://github.com/microsoft/debug-gym/blob/main/data/mini_nightmare/mini_nightmare.md). | > [!NOTE] -> Since debug-gym focuses on debugging task with the use of a debugger, we provide a customized version of `swebench`, called `swebench-debug`, where each problem's codebase already has the gold test patch applied. This allows us to better simulate real-world debugging scenarios where the buggy code is expected to have failing tests and we can set the debugger's entrypoint accordingly. To use `swebench-debug`, set `benchmark: "swebench-debug"` in your config file (see [Running Baselines](#3-running-baselines)). +> Since debug-gym focuses on debugging tasks with the use of a debugger, we provide a customized version of `swebench`, called `swebench-debug`, where each problem's codebase already has the gold test patch applied. This allows us to better simulate real-world debugging scenarios where the buggy code is expected to have failing tests and we can set the debugger's entrypoint accordingly. To use `swebench-debug`, use `configs/swebench_debug.yaml` or set `task_data.dataset_type: swebench-debug` in your config file. --- @@ -148,9 +149,9 @@ Terminal selection is configured through the `terminal_config` in your script co --- ## 3. Running Baselines -We use `.yaml` files to specify configurations. Example config files can be found in `scripts/`. To run an agent: +We use `.yaml` files to specify configurations. Example config files can be found in `configs/`. To run an agent: - python scripts/run.py scripts/config_.yaml --agent + python scripts/run.py configs/.yaml Add `-v`, `--debug` to be verbose, or to enter debug mode. > [!WARNING] @@ -158,24 +159,24 @@ Add `-v`, `--debug` to be verbose, or to enter debug mode. 
#### 3.1 Sanity Checks -We can use the `solution_agent` to validate that your `swebench` and `swesmith` instances work as expected. This agent will apply a gold patch to the buggy code and check that the tests are failing before applying the patch, and passing after. It also checks that `pdb` tool can be used as expected. +We can use the `solution_agent` to validate that your `swebench`, `swesmith`, and `r2egym` instances work as expected. This agent will apply a gold patch to the buggy code and check that the tests are failing before applying the patch, and passing after. It also checks that `pdb` tool can be used as expected (if available). - python scripts/run.py scripts/config_swebench.yaml --agent solution_agent - python scripts/run.py scripts/config_swesmith.yaml --agent solution_agent + python scripts/run.py configs/swebench.yaml -p agent.type=solution_agent + python scripts/run.py configs/swesmith.yaml -p agent.type=solution_agent + python scripts/run.py configs/r2egym.yaml -p agent.type=solution_agent #### 3.2 Human Mode -We provide a human mode that enables developers to manually interact with `debug-gym`. To activate this mode, change the `llm_name` field in the `config_*.yaml` to be `"human"`. Once activated, at every step, the environment will expect a command input (in tool calling format). One can use the `Tab` key to get a list of tool calling templates and fill in any necessary arguments. +We provide a human mode that enables developers to manually interact with `debug-gym`. To activate this mode, change the `llm_name` field in your config YAML to `"human"`. Once activated, at every step, the environment will expect a command input (in tool calling format). One can use the `Tab` key to get a list of tool calling templates and fill in any necessary arguments. #### 3.3. Overriding Values in Config -The `-p` flag is a handy way to override values defined in the config file. 
For example, the command below will run the `froggy_agent` configuration on Aider with human mode (even if the config file specifies gpt-4o). The command also overrides the default system prompt (see below for more information). +The `-p` flag is a handy way to override values defined in the config file. For example, the command below will run on Aider with human mode (even if the config file specifies gpt-4o). The command also overrides the default system prompt (see below for more information). - python scripts/run.py scripts/config_aider.yaml \ - --agent froggy_agent \ + python scripts/run.py configs/aider.yaml \ -v \ - -p froggy_agent.llm_name="human" \ - -p froggy_agent.system_prompt="scripts/templates/human_friendly_system_prompt.jinja" + -p llm_name="human" \ + -p agent.system_prompt="scripts/templates/human_friendly_system_prompt.jinja" #### 3.4. Customizing the System Prompt with Jinja Templates @@ -185,11 +186,12 @@ The `-p` flag is a handy way to override values defined in the config file. For To use a custom system prompt template, specify the path to your Jinja template file in your agent's configuration under `system_prompt`. For example: ```yaml -froggy_agent: - system_prompt: scripts/templates/custom_system_prompt.jinja +agent: + type: froggy + system_prompt: scripts/templates/custom_system_prompt.jinja ``` -Alternatively, you can provide a custom template from the command line with `-p .system_prompt=""` (see above). +Alternatively, you can provide a custom template from the command line with `-p agent.system_prompt=""` (see above). Within your Jinja template, you have access to the `agent` and `info` objects, which provide all relevant context about the current environment and agent state. @@ -233,17 +235,17 @@ Shortcut Features: #### 3.5. Debugging a Custom Repository -Modify `scripts/config.yaml`, especially the `env_kwargs` to set the path and entrypoint of the custom repository. 
We assume there is a `.debugignore` file and a `.debugreadonly` within the repository that labels files/folders that are not seen or not editable, respectively. +Modify `configs/config.yaml`, especially the `task_data` section to set the path and entrypoint of the custom repository. We assume there is a `.debugignore` file and a `.debugreadonly` within the repository that labels files/folders that are not seen or not editable, respectively. As an example, we provide a buggy pytorch code repository in `data/pytorch`. - python scripts/run.py scripts/config.yaml --agent + python scripts/run.py configs/config.yaml #### 3.6. Debugging a Custom SWE-Smith Instance -[SWE-Smith](https://github.com/SWE-bench/SWE-smith) allows to generate new buggy code instances. Give a custom HuggingFace dataset (either local or remote) that has a similar structure as [SWE-bench/SWE-smith](https://huggingface.co/datasets/SWE-bench/SWE-smith), one can override the `-p base.env_kwargs.dataset_id=` in the command line to run the agent on that dataset. For example, to run on a local dataset: +[SWE-Smith](https://github.com/SWE-bench/SWE-smith) allows you to generate new buggy code instances. Given a custom HuggingFace dataset (either local or remote) that has a similar structure as [SWE-bench/SWE-smith](https://huggingface.co/datasets/SWE-bench/SWE-smith), one can override the `-p task_data.dataset_id=` in the command line to run the agent on that dataset. For example, to run on a local dataset: - python scripts/run.py scripts/config_swesmith.yaml --agent -p base.env_kwargs.dataset_id="path/to/local/dataset" + python scripts/run.py configs/swesmith.yaml -p task_data.dataset_id="path/to/local/dataset" #### 3.7. Design Your Own Tool `debug-gym`'s modular design makes it extensible. Users are encouraged to extend `debug-gym` to their specific usecases, for example by creating new tools that diversify an agent's action and observation spaces. 
For detailed instruction on designing new tools that are `debug-gym`-compatible, please refer to the [Technical Report](https://arxiv.org/abs/2503.21557). @@ -254,6 +256,53 @@ We provide a set of scripts to help analyze the log files (e.g., the `.jsonl` fi - In the `analysis` folder, we provide scripts that used to generate the corresponding figures in our technical report. - In the `analysis/json_log_viewer` folder, we provide a Flask app to view a `.jsonl` log file in the browser. +#### 3.9. FreeEnv: Open-Ended Agent Development + +While `debug-gym` was designed for debugging tasks, the `FreeEnv` environment enables open-ended agent development beyond SWE-bench-style debugging. Use `FreeEnv` to build and test general-purpose coding agents that can perform any task you define—code exploration, refactoring, feature implementation, or custom workflows. + +**Key features:** +- **Custom Docker image**: Specify any Docker image as the execution environment +- **Flexible tool configuration**: Mix and match tools (`bash`, `edit`, `pdb`, `view`, `grep`, etc.) as needed +- **Custom system prompts**: Define your agent's behavior and goals +- **No predefined test harness**: The `submit` tool simply ends the session without running evaluations (configurable via `eval_on_submit`) + +**Example configuration** (`configs/free_env.yaml`): + +```yaml +task_name: free-session +output_path: exps/free_env + +llm_name: gpt-4o + +tools: + - edit + - bash + - submit: + eval_on_submit: false + +task_data: + env_type: FreeEnv + image: ubuntu:22.04 + local_path: /path/to/your/codebase + workspace_dir: /testbed + +terminal: + type: docker + +agent: + type: froggy + max_steps: 50 + system_prompt: >- + You are a coding assistant. Use the available tools to explore and modify the codebase. + When you are done, use the submit tool to end the session. 
+``` + +Run with: + + python scripts/run.py configs/free_env.yaml + +This provides a sandbox for developing and evaluating coding agents on arbitrary tasks, making `debug-gym` useful for general agent research beyond debugging. + ## Citation ``` @article{yuan2025debuggym,