Skip to content

Commit a765ee7

Browse files
committed
Update run.py and config files
1 parent 3959705 commit a765ee7

15 files changed

+189
-74
lines changed

debug_gym/agents/base_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,5 +371,5 @@ def create_agent(
371371
if agent_args is None:
372372
raise ValueError("Either agent_args or config must be provided.")
373373

374-
agent = agent_class(args=agent_args, **agent_kwargs)
374+
agent = agent_class(agent_args=agent_args, **agent_kwargs)
375375
return agent

debug_gym/agents/utils.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,6 @@ def load_config():
108108
with open(args.config_file) as reader:
109109
config = yaml.safe_load(reader)
110110

111-
# Parse overridden params.
112-
for param in args.params:
113-
fqn_key, value = param.split("=")
114-
entry_to_change = config
115-
keys = fqn_key.split(".")
116-
for k in keys[:-1]:
117-
entry_to_change = entry_to_change[k]
118-
entry_to_change[keys[-1]] = yaml.safe_load(value)
119-
120111
available_agents = [item for item in list(config.keys()) if item != "base"]
121112

122113
if not args.agent:
@@ -130,14 +121,25 @@ def load_config():
130121
if "base" in config:
131122
# base config is specified (shared across agents)
132123
return_config = config["base"]
133-
agent_specific_config = config[args.agent]
134-
for key in agent_specific_config:
135-
# override base config with agent specific config
136-
return_config[key] = agent_specific_config[key]
124+
# Override base config with agent specific config
125+
for key, value in config[args.agent].items():
126+
return_config[key] = value
137127
else:
138128
# base config is not specified
139129
return_config = config[args.agent]
140130

131+
# Parse overridden params.
132+
for param in args.params:
133+
fqn_key, value = param.split("=")
134+
entry_to_change = return_config
135+
keys = fqn_key.split(".")
136+
for k in keys[:-1]:
137+
if k not in entry_to_change:
138+
entry_to_change[k] = {}
139+
140+
entry_to_change = entry_to_change[k]
141+
entry_to_change[keys[-1]] = yaml.safe_load(value)
142+
141143
# assume agent type is the key if not specified by the user
142144
if not return_config.get("agent_type"):
143145
return_config["agent_type"] = args.agent

debug_gym/gym/envs/__init__.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,11 @@
66
from debug_gym.gym.envs.swe_bench import SWEBenchEnv
77
from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
88
from debug_gym.gym.envs.swe_smith import SWESmithEnv
9+
from debug_gym.logger import DebugGymLogger
910

1011

1112
def select_env(env_type: str = None) -> type[RepoEnv]:
1213
match env_type:
13-
case None:
14-
return RepoEnv
1514
case "local":
1615
return LocalEnv
1716
case "aider":
@@ -27,4 +26,20 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
2726
case "r2egym":
2827
return R2EGymEnv
2928
case _:
30-
raise ValueError(f"Unknown benchmark {env_type}")
29+
raise ValueError(f"Unknown environment {env_type}")
30+
31+
32+
def load_dataset(config: dict, logger: DebugGymLogger | None = None) -> dict:
33+
"""Load dataset based on the given config."""
34+
if config.get("type") is None:
35+
raise ValueError("Dataset config must specify 'type' field.")
36+
37+
try:
38+
env = select_env(config.get("type"))
39+
except ValueError as e:
40+
raise ValueError(
41+
f"Unknown environment type '{config.get('type')}' from dataset's config: {config}"
42+
)
43+
44+
dataset = env.load_dataset(logger=logger, **config)
45+
return dataset

debug_gym/gym/envs/aider.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ def load_dataset(
138138
problems: str | list[str] | None = None,
139139
build_image: bool = True,
140140
logger: object = None,
141+
**kwargs,
141142
) -> dict:
142143
if build_image:
143144
build_docker_image(logger)
@@ -184,4 +185,9 @@ def load_dataset(
184185

185186
problems = utils.filter_problems(dataset, problems)
186187
dataset = {id: data for id, data in dataset.items() if id in problems}
188+
189+
# Add env_type to each task_data.
190+
for task_data in dataset.values():
191+
task_data["env_type"] = "aider"
192+
187193
return dataset

debug_gym/gym/envs/mini_nightmare.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ def load_dataset(
152152
problems: str | list[str] | None = None,
153153
build_image: bool = True,
154154
logger: object = None,
155+
**kwargs,
155156
) -> dict:
156157
if build_image:
157158
build_docker_image(logger)
@@ -180,4 +181,9 @@ def load_dataset(
180181

181182
problems = utils.filter_problems(dataset, problems)
182183
dataset = {id: data for id, data in dataset.items() if id in problems}
184+
185+
# Add env_type to each task_data.
186+
for task_data in dataset.values():
187+
task_data["env_type"] = "mini_nightmare"
188+
183189
return dataset

debug_gym/gym/envs/r2egym.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ def load_dataset(
262262
problems: list | None = None,
263263
prepull_images: bool = False,
264264
logger: DebugGymLogger | None = None,
265+
**kwargs,
265266
) -> dict:
266267
logger = logger or DebugGymLogger("debug_gym")
267268
data_path = Path(dataset_id)
@@ -297,9 +298,10 @@ def extract_instance_id(docker_image: str) -> str:
297298
problems = filter_problems(id2idx, problems, custom_splits, excluded_ids)
298299
dataset = {problem: ds[id2idx[problem]] for problem in problems}
299300

300-
# add instance id to each example (name of the image)
301+
# Add instance_id (name of the image) and env_type to each task_data.
301302
for instance_id, task_data in dataset.items():
302303
task_data["instance_id"] = instance_id
304+
task_data["env_type"] = "r2egym"
303305

304306
image_names = set(task_data["docker_image"] for task_data in dataset.values())
305307
logger.debug(

debug_gym/gym/envs/swe_bench.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def load_dataset(
182182
problems: list | None = None,
183183
prepull_images: bool = False,
184184
logger: DebugGymLogger | None = None,
185+
**kwargs,
185186
) -> dict:
186187
ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split]
187188

@@ -190,6 +191,10 @@ def load_dataset(
190191
problems = filter_problems(id2idx, problems)
191192
dataset = {problem: ds[id2idx[problem]] for problem in problems}
192193

194+
# Add env_type to each task_data.
195+
for task_data in dataset.values():
196+
task_data["env_type"] = "swebench"
197+
193198
image_names = set(
194199
f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in dataset
195200
)

debug_gym/gym/envs/swe_bench_debug.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,13 @@ def eval(self, **kwargs) -> EvalOutput:
1515
success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout)
1616
self.last_eval = EvalOutput(success, output)
1717
return self.last_eval
18+
19+
@classmethod
20+
def load_dataset(*args, **kwargs) -> dict:
21+
dataset = SWEBenchEnv.load_dataset(*args, **kwargs)
22+
23+
# Add env_type to each task_data.
24+
for task_data in dataset.values():
25+
task_data["env_type"] = "swebench-debug"
26+
27+
return dataset

debug_gym/gym/envs/swe_smith.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def load_dataset(
153153
problems: list | None = None,
154154
prepull_images: bool = False,
155155
logger: DebugGymLogger | None = None,
156+
**kwargs,
156157
) -> dict:
157158
logger = logger or DebugGymLogger("debug_gym")
158159
data_path = Path(dataset_id)
@@ -182,6 +183,10 @@ def load_dataset(
182183
problems = filter_problems(id2idx, problems, custom_splits, excluded_ids)
183184
dataset = {problem: ds[id2idx[problem]] for problem in problems}
184185

186+
# Add env_type to each task_data.
187+
for task_data in dataset.values():
188+
task_data["env_type"] = "swesmith"
189+
185190
image_names = set(task_data["image_name"] for task_data in dataset.values())
186191
logger.debug(
187192
f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}."

scripts/config_aider.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
base:
22
# Environment configs
33
output_path: "exps/aider"
4-
benchmark: "aider"
5-
problems: "all" # list of problems, e.g., ["wordy"], or "all"
4+
65
env:
7-
type: "aider"
86
run_timeout: 20
7+
98
terminal:
109
type: "docker" # "docker", "kubernetes", or "local"
1110

11+
dataset:
12+
type: "aider"
13+
problems: "all" # list of problems, e.g., ["wordy"], or "all"
14+
1215
# LLM configs
1316
llm_name: "gpt-4o"
1417

0 commit comments

Comments
 (0)