microsoft · MarcCote · Dec 4, 2025 · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025
diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml
@@ -39,11 +39,13 @@ runs:
         else
           pip install "debug-gym[dev]==${{ inputs.version }}"
         fi
+        df -h
     - name: Run tests
       env:
         DEBUG_GYM_DEBUG: 1
       shell: bash
       run: |
+        free -h
         pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
     - name: Store coverage report
       uses: actions/upload-artifact@v4

diff --git a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py
@@ -371,5 +371,5 @@ def create_agent(
     if agent_args is None:
         raise ValueError("Either agent_args or config must be provided.")
 
-    agent = agent_class(args=agent_args, **agent_kwargs)
+    agent = agent_class(agent_args=agent_args, **agent_kwargs)
     return agent
diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py
@@ -39,28 +39,36 @@ def run(self, env, debug=False):
                 return True
 
             self.logger.info(f"Score: {info.score}/{info.max_score or '-'}")
-            # Make a simple pdb call to make sure it is working.
-            action = ToolCall(name="pdb", id="pdb", arguments={"command": "help help"})
-            pdb_help_info = self.env.step(action, None, None)
-            assert "h(elp)" in pdb_help_info.step_observation.observation, (
-                "PDB command did not return expected help message.\n"
-                f"{pdb_help_info.step_observation.observation}"
-            )
 
-            # Send a pdb continue command, and check the output matches the one from env.reset.
-            action = ToolCall(name="pdb", id="pdb", arguments={"command": "continue"})
-            pdb_continue_info = self.env.step(action, None, None)
-
-            assert (
-                "Reached the end of the program. Restarting the debugging session."
-                in pdb_continue_info.step_observation.observation
-            ) or (
-                info.step_observation.observation.splitlines()[-1]
-                in pdb_continue_info.step_observation.observation
-            ), (
-                "PDB command did not return expected continue message.\n"
-                f"{pdb_continue_info.step_observation.observation}"
-            )
+            if env.has_tool("pdb"):
+                # Make a simple pdb call to make sure it is working.
+                action = ToolCall(
+                    name="pdb", id="pdb", arguments={"command": "help help"}
+                )
+                pdb_help_info = self.env.step(action, None, None)
+                assert "h(elp)" in pdb_help_info.step_observation.observation, (
+                    "PDB command did not return expected help message.\n"
+                    f"{pdb_help_info.step_observation.observation}"
+                )
+
+                # Send a pdb continue command, and check the output matches the one from env.reset.
+                action = ToolCall(
+                    name="pdb", id="pdb", arguments={"command": "continue"}
+                )
+                pdb_continue_info = self.env.step(action, None, None)
+
+                pdb_observation = pdb_continue_info.step_observation.observation
+                expected_messages = [
+                    "Reached the end of the program. Restarting the debugging session.",
+                    "Uncaught exception. Entering post mortem debugging",
+                ]
+                reset_observation = info.step_observation.observation
+                if reset_observation.splitlines():
+                    expected_messages.append(reset_observation.splitlines()[-1])
+
+                assert any(
+                    msg in pdb_observation for msg in expected_messages
+                ), f"PDB command did not return expected continue message.\n{pdb_observation}"
 
             self.env.apply_gold_patch()
 

diff --git a/debug_gym/agents/utils.py b/debug_gym/agents/utils.py
@@ -108,15 +108,6 @@ def load_config():
     with open(args.config_file) as reader:
         config = yaml.safe_load(reader)
 
-    # Parse overriden params.
-    for param in args.params:
-        fqn_key, value = param.split("=")
-        entry_to_change = config
-        keys = fqn_key.split(".")
-        for k in keys[:-1]:
-            entry_to_change = entry_to_change[k]
-        entry_to_change[keys[-1]] = yaml.safe_load(value)
-
     available_agents = [item for item in list(config.keys()) if item != "base"]
 
     if not args.agent:
@@ -130,14 +121,25 @@ def load_config():
     if "base" in config:
         # base config is specified (shared across agents)
         return_config = config["base"]
-        agent_specific_config = config[args.agent]
-        for key in agent_specific_config:
-            # override base config with agent specific config
-            return_config[key] = agent_specific_config[key]
+        # Override base config with agent specific config
+        for key, value in config[args.agent].items():
+            return_config[key] = value
     else:
         # base config is not specified
         return_config = config[args.agent]
 
+    # Parse overridden params ("a.b.c=value", dotted keys) and apply them to return_config.
+    for param in args.params:
+        # Split on the first "=" only, so the value itself may contain "=".
+        fqn_key, value = param.split("=", 1)
+        *parents, leaf = fqn_key.split(".")
+        entry_to_change = return_config
+        for k in parents:
+            # Create intermediate sections that are not present yet.
+            entry_to_change = entry_to_change.setdefault(k, {})
+
+        entry_to_change[leaf] = yaml.safe_load(value)
+
     # assume agent type is the key if not specified by the user
     if not return_config.get("agent_type"):
         return_config["agent_type"] = args.agent

diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py
@@ -1,16 +1,18 @@
 from debug_gym.gym.envs.aider import AiderBenchmarkEnv
 from debug_gym.gym.envs.env import RepoEnv, TooledEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
 from debug_gym.gym.envs.r2egym import R2EGymEnv
 from debug_gym.gym.envs.swe_bench import SWEBenchEnv
 from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
 from debug_gym.gym.envs.swe_smith import SWESmithEnv
+from debug_gym.logger import DebugGymLogger
 
 
 def select_env(env_type: str = None) -> type[RepoEnv]:
     match env_type:
-        case None:
-            return RepoEnv
+        case "local":
+            return LocalEnv
         case "aider":
             return AiderBenchmarkEnv
         case "swebench":
@@ -24,4 +26,20 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
         case "r2egym":
             return R2EGymEnv
         case _:
-            raise ValueError(f"Unknown benchmark {env_type}")
+            raise ValueError(f"Unknown environment {env_type}")
+
+
+def load_dataset(config: dict, logger: DebugGymLogger | None = None) -> dict:
+    """Load the dataset of the env class selected by config["type"]; config is forwarded as kwargs."""
+    if config.get("type") is None:
+        raise ValueError("Dataset config must specify 'type' field.")
+
+    try:
+        env = select_env(config.get("type"))
+    except ValueError as e:
+        raise ValueError(
+            f"Unknown environment type '{config.get('type')}' from dataset's config: {config}"
+        ) from e
+
+    dataset = env.load_dataset(logger=logger, **config)
+    return dataset
diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import subprocess
 import tempfile
@@ -7,16 +8,20 @@
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.logger import DebugGymLogger
 
 DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider"
 
 
-def build_docker_image(logger):
+def build_docker_image(logger: logging.Logger | None = None):
     """
     Build a Docker image for the Mini Nightmare environment.
     """
+    logger = logger or DebugGymLogger("debug-gym")
+
     # Check if Docker image is built.
     import docker
 
@@ -62,6 +67,7 @@ class AiderBenchmarkEnv(RepoEnv):
 
     def __init__(
         self,
+        task_data: dict,
         entrypoint: str = "python -m pytest --tb=no -s .",
         terminal: Terminal | None = None,
         **kwargs,
@@ -73,7 +79,13 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_AIDER_IMAGE_NAME
 
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
+
+    @property
+    def task_name(self) -> str:
+        return self.current_task["task_name"]
 
     @property
     def instructions(self) -> str:
@@ -91,10 +103,8 @@ def eval(self, **kwargs) -> EvalOutput:
         self.last_eval = EvalOutput(success, output)
         return self.last_eval
 
-    def setup_task(self, task_name: str, options: dict = None):
-        if task_name not in self.dataset:
-            raise ValueError(f"Task {task_name} not found in the dataset.")
-        self.current_task = self.dataset[task_name]
+    def setup_task(self):
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -122,14 +132,21 @@ def setup_terminal(self):
         )  # Aider tasks come with those.
         self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
 
-    def load_dataset(self, problems: str | list[str] | None = None):
-        if isinstance(self.terminal, DockerTerminal):
-            build_docker_image(self.logger)
+    @classmethod
+    def load_dataset(
+        cls,
+        problems: str | list[str] | None = None,
+        build_image: bool = True,
+        logger: object = None,
+        **kwargs,
+    ) -> dict:
+        if build_image:
+            build_docker_image(logger)
 
-        if not os.path.exists(self.REPO_PATH):
-            subprocess.run(["git", "clone", self.REPO_URL, self.REPO_PATH], check=True)
+        if not os.path.exists(cls.REPO_PATH):
+            subprocess.run(["git", "clone", cls.REPO_URL, cls.REPO_PATH], check=True)
 
-        practice_path = self.REPO_PATH / "exercises" / "practice"
+        practice_path = cls.REPO_PATH / "exercises" / "practice"
         directories = [d for d in practice_path.iterdir() if d.is_dir()]
 
         dataset = {}
@@ -160,11 +177,17 @@ def load_dataset(self, problems: str | list[str] | None = None):
             )
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": directory,
                 "instructions": instructions,
                 "filename": task_name + ".py",
             }
 
         problems = utils.filter_problems(dataset, problems)
-        dataset = {id: i for id, i in dataset.items() if id in problems}
+        dataset = {id: data for id, data in dataset.items() if id in problems}
+
+        # Add env_type to each task_data.
+        for task_data in dataset.values():
+            task_data["env_type"] = "aider"
+
         return dataset
diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
@@ -201,41 +201,29 @@ class RepoEnv(TooledEnv):
 
     def __init__(
         self,
-        path: str | None = None,
+        task_data: dict,
         entrypoint: str = "python -m pytest -sq .",
         debug_entrypoint: str | None = None,
         max_score: int | None = None,
-        readonly_patterns: list[str] | None = None,  # TODO: remove
         run_timeout: int | None = None,
         terminal: Terminal | None = None,
         logger: DebugGymLogger | None = None,
-        problems: str | list[str] | None = None,
         **kwargs,
     ):
         super().__init__()
 
-        self.path = path
+        self.task_data = task_data
         self.max_score = max_score
         self.run_timeout = run_timeout
-        self.terminal = terminal or LocalTerminal()  # TODO: default to DockerTerminal
+        self.terminal = terminal
         self._entrypoint = entrypoint
         self._debug_entrypoint = debug_entrypoint
         self.logger = logger or DebugGymLogger("debug-gym")
         self.infos: EnvInfo | None = None
         self.rng = None
         self.additional_kwargs = kwargs
-        self.task_name: str | None = None
-        self.options: dict = {}
-
-        if "auto_eval_on_rewrite" in kwargs:
-            raise ValueError(
-                "The 'auto_eval_on_rewrite' parameter is no longer supported. "
-                "Please remove it from your initialization arguments."
-                "Instead, set 'auto_eval_on_rewrite' in the EvalTool instance."
-            )
 
         self.workspace = Workspace(self.terminal, logger=self.logger)
-        self.dataset = self.load_dataset(problems)
         self.set_entrypoints(self._entrypoint, self._debug_entrypoint)
 
     def _reset_env_state(self):
@@ -290,45 +278,39 @@ def working_dir(self) -> Path:
     def instructions(self) -> str:
         """Instructions for the current task.
         Override in subclasses for different behavior."""
-        return ""
+        raise NotImplementedError(
+            "Subclasses must implement the instructions property."
+        )
 
-    def setup_task(self, task_name: str, options: dict = None) -> None:
+    @property
+    def task_name(self) -> str:
+        raise NotImplementedError("Subclasses must implement the task_name property.")
+
+    def setup_task(self) -> None:
         """Setup the task information.
         Override in subclasses for different behavior. Called once at reset."""
-        pass
+        raise NotImplementedError("Subclasses must implement setup_task method.")
 
     def setup_workspace(self) -> None:
         """Setup the workspace.
         Override in subclasses for different behavior. Called once at reset."""
-        self.workspace.reset()
-        self.workspace.copy_content(self.path)
-        self.workspace.setup_file_filters()
+        raise NotImplementedError("Subclasses must implement setup_workspace method.")
 
     def setup_terminal(self) -> None:
         """Setup the terminal.
         Override in subclasses for different behavior. Called once at reset."""
-
-        self.logger.debug(f"Configuring {self.terminal}...")
-
-        self.terminal.run("git init -b main")
-        self.terminal.run("git config user.name 'debug-gym'")
-        self.terminal.run("git config user.email '<>'")
-
-        self.terminal.run("git add *")
-        self.terminal.run("git commit -am 'Init'")
-
-        self.terminal.run("git add .debugignore .debugreadonly")
-        self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
+        raise NotImplementedError("Subclasses must implement setup_terminal method.")
 
     def reset(self, *, options: dict = None):
         """Resets the environment and returns eval as the initial observation."""
-        self.options = options if options is not None else self.options
+        options = options if options is not None else {}
         self.logger.debug("Resetting environment")
-        self.close()  # Clean up previous workspace and terminal.
-        self.task_name = self.options.get("task_name")
-        self.setup_task(task_name=self.task_name, options=self.options)
-        self.setup_workspace()
-        self.setup_terminal()
+        if options.get("reset_runtime", True):
+            self.close()  # Clean up previous workspace and terminal.
+            self.setup_task()
+            self.setup_workspace()
+            self.setup_terminal()
+
         self._reset_env_state()
 
         # Notify all tools that the environment is reset and get their observations
@@ -504,6 +486,3 @@ def close(self):
 
     def __del__(self):
         self.close()
-
-    def load_dataset(self, problems: str | list[str] | None = None):
-        return {"custom": None}