microsoft
diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py
Lines changed: 11 additions & 5 deletions
diff --git a/debug_gym/gym/envs/local.py b/debug_gym/gym/envs/local.py
Lines changed: 6 additions & 1 deletion
diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py
Lines changed: 10 additions & 6 deletions
diff --git a/tests/gym/envs/test_aider.py b/tests/gym/envs/test_aider.py
Lines changed: 8 additions & 4 deletions
diff --git a/tests/gym/envs/test_env.py b/tests/gym/envs/test_env.py
Lines changed: 22 additions & 24 deletions
diff --git a/tests/gym/envs/test_mini_nightmare.py b/tests/gym/envs/test_mini_nightmare.py
Lines changed: 10 additions & 8 deletions
@@ -2,12 +2,12 @@
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import List
 
 import debug_gym.gym.utils as utils
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.terminal import Terminal
 
@@ -75,8 +75,13 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_AIDER_IMAGE_NAME
 
-        self.task_data = task_data
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
+
+    @property
+    def task_name(self) -> str:
+        return self.current_task["task_name"]
 
     @property
     def instructions(self) -> str:
@@ -95,7 +100,7 @@ def eval(self, **kwargs) -> EvalOutput:
         return self.last_eval
 
     def setup_task(self):
-        pass
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -127,7 +132,7 @@ def setup_terminal(self):
     def load_dataset(
         cls,
         problems: str | list[str] | None = None,
-        build_image: bool = False,
+        build_image: bool = True,
         logger: object = None,
     ) -> dict:
         if build_image:
@@ -167,6 +172,7 @@ def load_dataset(
             )
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": directory,
                 "instructions": instructions,
                 "filename": task_name + ".py",
 
@@ -1,25 +1,30 @@
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.terminals.local import LocalTerminal
+from debug_gym.gym.terminals.terminal import Terminal
 
 
 class LocalEnv(RepoEnv):
 
     def __init__(
         self,
         path: str,
+        terminal: Terminal | None = None,
         entrypoint: str = "python -m pytest -sq .",
         debug_entrypoint: str | None = None,
         **kwargs,
     ):
         task_data = {"path": path}
+        terminal = terminal or LocalTerminal()
         super().__init__(
             task_data=task_data,
+            terminal=terminal,
             entrypoint=entrypoint,
             debug_entrypoint=debug_entrypoint,
             **kwargs,
         )
 
     @property
-    def instruction(self) -> str:
+    def instructions(self) -> str:
         return f"Debug the local codebase at {self.path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue."
 
     @property
 
@@ -86,10 +86,9 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME
 
-        self.task_data = task_data
-        self.task_name = task_data["task_name"]
-
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
 
     @property
     def instructions(self) -> str:
@@ -99,6 +98,10 @@ def instructions(self) -> str:
             " Beaware that the bug may not be in the code you initially see."
         )
 
+    @property
+    def task_name(self) -> str:
+        return self.current_task["task_name"]
+
     def calculate_max_score(self, eval_output: EvalOutput) -> int:
         return utils.extract_max_score_from_pytest_output(eval_output.output)
 
@@ -112,7 +115,7 @@ def eval(self, **kwargs) -> EvalOutput:
         return self.last_eval
 
     def setup_task(self):
-        pass
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -144,7 +147,7 @@ def setup_terminal(self):
     def load_dataset(
         cls,
         problems: str | list[str] | None = None,
-        build_image: bool = False,
+        build_image: bool = True,
         logger: object = None,
     ) -> dict:
         if build_image:
@@ -167,6 +170,7 @@ def load_dataset(
             assert (task_path / ".debugreadonly").exists()
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": task_path,
                 "filename": task_name + "_code.py",
             }
 
@@ -37,8 +37,10 @@ def setup_aider_repo(tmp_path_factory):
 @pytest.fixture
 def env(setup_aider_repo):
     terminal = LocalTerminal()
-    env = AiderBenchmarkEnv(terminal=terminal)
-    env.reset(options={"task_name": "clock"})
+    dataset = AiderBenchmarkEnv.load_dataset()
+    task_data = dataset["clock"]
+    env = AiderBenchmarkEnv(task_data=task_data, terminal=terminal)
+    env.reset()
     return env
 
 
@@ -103,13 +105,15 @@ def test_instructions(env):
 
 @patch("debug_gym.gym.envs.aider.build_docker_image")
 def test_build_docker_image(mock_build_docker_image):
-    AiderBenchmarkEnv()
+    dataset = AiderBenchmarkEnv.load_dataset()
     mock_build_docker_image.assert_called_once()
 
 
 @pytest.if_docker_running
 def test_reset_with_docker_terminal(setup_aider_repo):
-    env = AiderBenchmarkEnv()
+    dataset = AiderBenchmarkEnv.load_dataset()
+    task_data = dataset["clock"]
+    env = AiderBenchmarkEnv(task_data=task_data)
     env.add_tool(Toolbox.get_tool("eval"))
     assert isinstance(env.terminal, DockerTerminal)
 
 
@@ -6,13 +6,14 @@
 
 from debug_gym.gym.entities import EvalOutput, Event, Observation
 from debug_gym.gym.envs.env import EnvInfo, EventHooks, RepoEnv, TooledEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.tools.tool import ToolCall
 from debug_gym.gym.tools.toolbox import Toolbox
 
 
 @pytest.fixture
-def env_mock():
-    env = RepoEnv()
+def env_mock(tmp_path):
+    env = LocalEnv(path=tmp_path)
     return env
 
 
@@ -109,7 +110,7 @@ def test_tool_names(env_mock):
     assert env_mock.tool_names == "tool1, tool2"
 
 
-def test_env_tools():
+def test_env_tools(env_mock):
     tool1 = MagicMock()
     tool1.name = "tool1"
     tool1.description = "instructions1"
@@ -129,11 +130,10 @@ def test_env_tools():
         },
     }
 
-    env = RepoEnv()
-    env.add_tool(tool1)
-    env.add_tool(tool2)
+    env_mock.add_tool(tool1)
+    env_mock.add_tool(tool2)
 
-    assert env.tools == [tool1, tool2]
+    assert env_mock.tools == [tool1, tool2]
 
 
 @pytest.fixture
@@ -147,7 +147,7 @@ def env(tmp_path):
     (repo_path / "file2.txt").touch()
     (subdir_path / "subfile1.txt").touch()
 
-    env = RepoEnv(path=repo_path)
+    env = LocalEnv(path=repo_path)
     return env
 
 
@@ -186,7 +186,7 @@ def test_step(
     mock_pdb_tool.current_frame_file = "file.py"
     mock_get_tool.return_value = None
 
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     env.last_eval = EvalOutput(success=False, output="1 failed, 0 passed")
     tool_call = ToolCall(id="123", name="pdb", arguments={"command": "b 10"})
@@ -210,7 +210,7 @@ def test_reset(tmp_path):
     (tmp_path / "test.py").write_text("def test_1():\n  assert False\n")
     (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n")
 
-    env = RepoEnv(path=tmp_path, entrypoint="pytest test.py")
+    env = LocalEnv(path=tmp_path, entrypoint="pytest test.py")
     infos = env.reset()
 
     assert env.last_eval is None
@@ -224,7 +224,7 @@ def test_reset(tmp_path):
         action_reasoning=None,
         action_content=None,
         action_tool_call=None,
-        instructions="",
+        instructions=env.instructions,
         score=0,
         max_score=None,
         terminated=False,
@@ -276,7 +276,7 @@ def test_eval(tmp_path):
     (tmp_path / "test.py").write_text("def test_1():\n  assert False\n")
     (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n")
 
-    env = RepoEnv(path=tmp_path, entrypoint="pytest test.py")
+    env = LocalEnv(path=tmp_path, entrypoint="pytest test.py")
     env.reset()
     env.eval()
     assert "FAILED test.py::test_1 - assert False" in env.last_eval.output
@@ -287,7 +287,7 @@ def test_eval_success(tmp_path):
     # create a dummy file
     with open(tmp_path / "file.py", "w") as f:
         f.write("print('Hello, World!')")
-    env = RepoEnv(path=working_dir, entrypoint="python file.py")
+    env = LocalEnv(path=working_dir, entrypoint="python file.py")
     env.reset()
     output = env.eval()
     assert output == EvalOutput(success=True, output="Hello, World!")
@@ -298,7 +298,7 @@ def test_eval_timeout(tmp_path):
     # runs for longer than the timeout
     with open(tmp_path / "file.py", "w") as f:
         f.write("import time; time.sleep(5)")
-    env = RepoEnv(path=working_dir, entrypoint="python file.py", run_timeout=1)
+    env = LocalEnv(path=working_dir, entrypoint="python file.py", run_timeout=1)
     env.reset()
     output = env.eval()
     assert output == EvalOutput(success=False, output="Timeout expired.")
@@ -371,22 +371,20 @@ def test_event_hooks_notify():
     subscriber.on_env_start.assert_called_once()
 
 
-def test_current_breakpoints_no_breakpoints():
-    env = RepoEnv()
-    env.current_breakpoints_state = {}
-    result = env.current_breakpoints()
+def test_current_breakpoints_no_breakpoints(env_mock):
+    env_mock.current_breakpoints_state = {}
+    result = env_mock.current_breakpoints()
     assert result == "No breakpoints are set."
 
 
-def test_current_breakpoints_with_breakpoints(tmp_path):
-    env = RepoEnv()
-    env.current_breakpoints_state = {
+def test_current_breakpoints_with_breakpoints(tmp_path, env_mock):
+    env_mock.current_breakpoints_state = {
         "file1.py|||10": "b file1.py:10",
         "file1.py|||20": "b file1.py:20",
         "file1.py|||30": "b file1.py:30",
         "file2.py|||15": "b file2.py:15",
     }
-    result = env.current_breakpoints()
+    result = env_mock.current_breakpoints()
     expected_result = (
         "line 10 in file1.py\n"
         "line 20 in file1.py\n"
@@ -424,7 +422,7 @@ def test_queue_and_process_events():
 
 
 def test_has_breakpoint_true_and_false(tmp_path):
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     file_path = env.working_dir / "test.py"
     file_path.write_text("print('hello')")
@@ -438,7 +436,7 @@ def test_has_breakpoint_true_and_false(tmp_path):
 
 
 def test_has_breakpoint_relative_path(tmp_path):
-    env = RepoEnv(path=tmp_path)
+    env = LocalEnv(path=tmp_path)
     env.reset()
     file_path = env.working_dir / "foo.py"
     file_path.write_text("print('foo')")
 
@@ -12,23 +12,23 @@
 def mini_nightmare_env():
     # Initialize the MiniNightmareEnv with LocalTerminal
     terminal = LocalTerminal()
-    env = MiniNightmareEnv(terminal=terminal)
+    dataset = MiniNightmareEnv.load_dataset()
+    task_data = dataset["config"]
+    env = MiniNightmareEnv(task_data=task_data, terminal=terminal)
     env.add_tool(Toolbox.get_tool("eval"))
     return env
 
 
 def test_load_dataset(mini_nightmare_env):
-    dataset = mini_nightmare_env.load_dataset()
-    assert mini_nightmare_env.dataset == dataset
-
+    dataset = MiniNightmareEnv.load_dataset()
     subproblems = list(dataset.keys())[::2]
-    subset = mini_nightmare_env.load_dataset(problems=subproblems)
+    subset = MiniNightmareEnv.load_dataset(problems=subproblems)
     assert list(subset.keys()) == subproblems
 
 
 @patch("debug_gym.gym.envs.mini_nightmare.build_docker_image")
 def test_build_docker_image(mock_build_docker_image):
-    MiniNightmareEnv()
+    dataset = MiniNightmareEnv.load_dataset()
     mock_build_docker_image.assert_called_once()
 
 
@@ -53,11 +53,13 @@ def test_reset(mini_nightmare_env):
 
 @pytest.if_docker_running
 def test_reset_with_docker_terminal():
-    env = MiniNightmareEnv()
+    dataset = MiniNightmareEnv.load_dataset()
+    task_data = dataset["config"]
+    env = MiniNightmareEnv(task_data=task_data)
     env.add_tool(Toolbox.get_tool("eval"))
     assert isinstance(env.terminal, DockerTerminal)
 
-    infos = env.reset(options={"task_name": "config"})
+    infos = env.reset()
     assert env.instructions == infos.step_observation.observation
     assert "2 failed" in infos.eval_observation.observation
     assert infos.max_score == 2