diff --git a/.github/actions/test-if-changes/action.yml b/.github/actions/test-if-changes/action.yml index a2c872cf..deb254c1 100644 --- a/.github/actions/test-if-changes/action.yml +++ b/.github/actions/test-if-changes/action.yml @@ -39,11 +39,13 @@ runs: else pip install "debug-gym[dev]==${{ inputs.version }}" fi + df -h - name: Run tests env: DEBUG_GYM_DEBUG: 1 shell: bash run: | + free -h pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing - name: Store coverage report uses: actions/upload-artifact@v4 diff --git a/debug_gym/agents/base_agent.py b/debug_gym/agents/base_agent.py index 767976a3..149a726a 100644 --- a/debug_gym/agents/base_agent.py +++ b/debug_gym/agents/base_agent.py @@ -371,5 +371,5 @@ def create_agent( if agent_args is None: raise ValueError("Either agent_args or config must be provided.") - agent = agent_class(args=agent_args, **agent_kwargs) + agent = agent_class(agent_args=agent_args, **agent_kwargs) return agent diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py index e0f31af8..f5704635 100644 --- a/debug_gym/agents/solution_agent.py +++ b/debug_gym/agents/solution_agent.py @@ -39,28 +39,36 @@ def run(self, env, debug=False): return True self.logger.info(f"Score: {info.score}/{info.max_score or '-'}") - # Make a simple pdb call to make sure it is working. - action = ToolCall(name="pdb", id="pdb", arguments={"command": "help help"}) - pdb_help_info = self.env.step(action, None, None) - assert "h(elp)" in pdb_help_info.step_observation.observation, ( - "PDB command did not return expected help message.\n" - f"{pdb_help_info.step_observation.observation}" - ) - # Send a pdb continue command, and check the output matches the one from env.reset. - action = ToolCall(name="pdb", id="pdb", arguments={"command": "continue"}) - pdb_continue_info = self.env.step(action, None, None) - - assert ( - "Reached the end of the program. Restarting the debugging session." - in pdb_continue_info.step_observation.observation - ) or ( - info.step_observation.observation.splitlines()[-1] - in pdb_continue_info.step_observation.observation - ), ( - "PDB command did not return expected continue message.\n" - f"{pdb_continue_info.step_observation.observation}" - ) + if env.has_tool("pdb"): + # Make a simple pdb call to make sure it is working. + action = ToolCall( + name="pdb", id="pdb", arguments={"command": "help help"} + ) + pdb_help_info = self.env.step(action, None, None) + assert "h(elp)" in pdb_help_info.step_observation.observation, ( + "PDB command did not return expected help message.\n" + f"{pdb_help_info.step_observation.observation}" + ) + + # Send a pdb continue command, and check the output matches the one from env.reset. + action = ToolCall( + name="pdb", id="pdb", arguments={"command": "continue"} + ) + pdb_continue_info = self.env.step(action, None, None) + + pdb_observation = pdb_continue_info.step_observation.observation + expected_messages = [ + "Reached the end of the program. Restarting the debugging session.", + "Uncaught exception. 
Entering post mortem debugging", + ] + reset_observation = info.step_observation.observation + if reset_observation.splitlines(): + expected_messages.append(reset_observation.splitlines()[-1]) + + assert any( + msg in pdb_observation for msg in expected_messages + ), f"PDB command did not return expected continue message.\n{pdb_observation}" self.env.apply_gold_patch() diff --git a/debug_gym/agents/utils.py b/debug_gym/agents/utils.py index 65578d5c..eae7dced 100644 --- a/debug_gym/agents/utils.py +++ b/debug_gym/agents/utils.py @@ -108,15 +108,6 @@ def load_config(): with open(args.config_file) as reader: config = yaml.safe_load(reader) - # Parse overriden params. - for param in args.params: - fqn_key, value = param.split("=") - entry_to_change = config - keys = fqn_key.split(".") - for k in keys[:-1]: - entry_to_change = entry_to_change[k] - entry_to_change[keys[-1]] = yaml.safe_load(value) - available_agents = [item for item in list(config.keys()) if item != "base"] if not args.agent: @@ -130,14 +121,25 @@ def load_config(): if "base" in config: # base config is specified (shared across agents) return_config = config["base"] - agent_specific_config = config[args.agent] - for key in agent_specific_config: - # override base config with agent specific config - return_config[key] = agent_specific_config[key] + # Override base config with agent specific config + for key, value in config[args.agent].items(): + return_config[key] = value else: # base config is not specified return_config = config[args.agent] + # Parse overriden params. + for param in args.params: + fqn_key, value = param.split("=") + entry_to_change = return_config + keys = fqn_key.split(".") + for k in keys[:-1]: + if k not in entry_to_change: + entry_to_change[k] = {} + + entry_to_change = entry_to_change[k] + entry_to_change[keys[-1]] = yaml.safe_load(value) + # assume agent type is the key if not specified by the user if not return_config.get("agent_type"): return_config["agent_type"] = args.agent diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py index 86ef4cab..91cc7d5b 100644 --- a/debug_gym/gym/envs/__init__.py +++ b/debug_gym/gym/envs/__init__.py @@ -1,16 +1,18 @@ from debug_gym.gym.envs.aider import AiderBenchmarkEnv from debug_gym.gym.envs.env import RepoEnv, TooledEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv from debug_gym.gym.envs.r2egym import R2EGymEnv from debug_gym.gym.envs.swe_bench import SWEBenchEnv from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv from debug_gym.gym.envs.swe_smith import SWESmithEnv +from debug_gym.logger import DebugGymLogger def select_env(env_type: str = None) -> type[RepoEnv]: match env_type: - case None: - return RepoEnv + case "local": + return LocalEnv case "aider": return AiderBenchmarkEnv case "swebench": @@ -24,4 +26,20 @@ def select_env(env_type: str = None) -> type[RepoEnv]: case "r2egym": return R2EGymEnv case _: - raise ValueError(f"Unknown benchmark {env_type}") + raise ValueError(f"Unknown environment {env_type}") + + +def load_dataset(config: dict, logger: DebugGymLogger | None = None) -> dict: + """Load dataset based on the given config.""" + if config.get("type") is None: + raise ValueError("Dataset config must specify 'type' field.") + + try: + env = select_env(config.get("type")) + except ValueError as e: + raise ValueError( + f"Unknown environment type '{config.get('type')}' from dataset's config: {config}" + ) + + dataset = env.load_dataset(logger=logger, 
**config) + return dataset diff --git a/debug_gym/gym/envs/aider.py b/debug_gym/gym/envs/aider.py index 26776448..aacf71e4 100644 --- a/debug_gym/gym/envs/aider.py +++ b/debug_gym/gym/envs/aider.py @@ -1,3 +1,4 @@ +import logging import os import subprocess import tempfile @@ -7,16 +8,20 @@ from debug_gym.constants import DEBUG_GYM_CACHE_DIR from debug_gym.gym.entities import EvalOutput from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.logger import DebugGymLogger DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider" -def build_docker_image(logger): +def build_docker_image(logger: logging.Logger | None = None): """ Build a Docker image for the Mini Nightmare environment. """ + logger = logger or DebugGymLogger("debug-gym") + # Check if Docker image is built. import docker @@ -62,6 +67,7 @@ class AiderBenchmarkEnv(RepoEnv): def __init__( self, + task_data: dict, entrypoint: str = "python -m pytest --tb=no -s .", terminal: Terminal | None = None, **kwargs, @@ -73,7 +79,13 @@ def __init__( if hasattr(terminal, "base_image") and terminal.base_image is None: terminal.base_image = DOCKER_AIDER_IMAGE_NAME - super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs) + super().__init__( + task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs + ) + + @property + def task_name(self) -> str: + return self.current_task["task_name"] @property def instructions(self) -> str: @@ -91,10 +103,8 @@ def eval(self, **kwargs) -> EvalOutput: self.last_eval = EvalOutput(success, output) return self.last_eval - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError(f"Task {task_name} not found in the dataset.") - self.current_task = self.dataset[task_name] + def setup_task(self): + self.current_task = self.task_data def setup_workspace(self): self.workspace.reset() @@ -122,14 +132,21 @@ def setup_terminal(self): ) # Aider tasks come with those. self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") - def load_dataset(self, problems: str | list[str] | None = None): - if isinstance(self.terminal, DockerTerminal): - build_docker_image(self.logger) + @classmethod + def load_dataset( + cls, + problems: str | list[str] | None = None, + build_image: bool = True, + logger: object = None, + **kwargs, + ) -> dict: + if build_image: + build_docker_image(logger) - if not os.path.exists(self.REPO_PATH): - subprocess.run(["git", "clone", self.REPO_URL, self.REPO_PATH], check=True) + if not os.path.exists(cls.REPO_PATH): + subprocess.run(["git", "clone", cls.REPO_URL, cls.REPO_PATH], check=True) - practice_path = self.REPO_PATH / "exercises" / "practice" + practice_path = cls.REPO_PATH / "exercises" / "practice" directories = [d for d in practice_path.iterdir() if d.is_dir()] dataset = {} @@ -160,11 +177,17 @@ def load_dataset(self, problems: str | list[str] | None = None): ) dataset[task_name] = { + "task_name": task_name, "codebase": directory, "instructions": instructions, "filename": task_name + ".py", } problems = utils.filter_problems(dataset, problems) - dataset = {id: i for id, i in dataset.items() if id in problems} + dataset = {id: data for id, data in dataset.items() if id in problems} + + # Add env_type to each task_data. 
+ for task_data in dataset.values(): + task_data["env_type"] = "aider" + return dataset diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py index 48807546..13b54d76 100644 --- a/debug_gym/gym/envs/env.py +++ b/debug_gym/gym/envs/env.py @@ -201,41 +201,29 @@ class RepoEnv(TooledEnv): def __init__( self, - path: str | None = None, + task_data: dict, entrypoint: str = "python -m pytest -sq .", debug_entrypoint: str | None = None, max_score: int | None = None, - readonly_patterns: list[str] | None = None, # TODO: remove run_timeout: int | None = None, terminal: Terminal | None = None, logger: DebugGymLogger | None = None, - problems: str | list[str] | None = None, **kwargs, ): super().__init__() - self.path = path + self.task_data = task_data self.max_score = max_score self.run_timeout = run_timeout - self.terminal = terminal or LocalTerminal() # TODO: default to DockerTerminal + self.terminal = terminal self._entrypoint = entrypoint self._debug_entrypoint = debug_entrypoint self.logger = logger or DebugGymLogger("debug-gym") self.infos: EnvInfo | None = None self.rng = None self.additional_kwargs = kwargs - self.task_name: str | None = None - self.options: dict = {} - - if "auto_eval_on_rewrite" in kwargs: - raise ValueError( - "The 'auto_eval_on_rewrite' parameter is no longer supported. " - "Please remove it from your initialization arguments." - "Instead, set 'auto_eval_on_rewrite' in the EvalTool instance." - ) self.workspace = Workspace(self.terminal, logger=self.logger) - self.dataset = self.load_dataset(problems) self.set_entrypoints(self._entrypoint, self._debug_entrypoint) def _reset_env_state(self): @@ -290,45 +278,39 @@ def working_dir(self) -> Path: def instructions(self) -> str: """Instructions for the current task. Override in subclasses for different behavior.""" - return "" + raise NotImplementedError( + "Subclasses must implement the instructions property." + ) - def setup_task(self, task_name: str, options: dict = None) -> None: + @property + def task_name(self) -> str: + raise NotImplementedError("Subclasses must implement the task_name property.") + + def setup_task(self) -> None: """Setup the task information. Override in subclasses for different behavior. Called once at reset.""" - pass + raise NotImplementedError("Subclasses must implement setup_task method.") def setup_workspace(self) -> None: """Setup the workspace. Override in subclasses for different behavior. Called once at reset.""" - self.workspace.reset() - self.workspace.copy_content(self.path) - self.workspace.setup_file_filters() + raise NotImplementedError("Subclasses must implement setup_workspace method.") def setup_terminal(self) -> None: """Setup the terminal. Override in subclasses for different behavior. 
Called once at reset.""" - - self.logger.debug(f"Configuring {self.terminal}...") - - self.terminal.run("git init -b main") - self.terminal.run("git config user.name 'debug-gym'") - self.terminal.run("git config user.email '<>'") - - self.terminal.run("git add *") - self.terminal.run("git commit -am 'Init'") - - self.terminal.run("git add .debugignore .debugreadonly") - self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") + raise NotImplementedError("Subclasses must implement setup_terminal method.") def reset(self, *, options: dict = None): """Resets the environment and returns eval as the initial observation.""" - self.options = options if options is not None else self.options + options = options if options is not None else {} self.logger.debug("Resetting environment") - self.close() # Clean up previous workspace and terminal. - self.task_name = self.options.get("task_name") - self.setup_task(task_name=self.task_name, options=self.options) - self.setup_workspace() - self.setup_terminal() + if options.get("reset_runtime", True): + self.close() # Clean up previous workspace and terminal. + self.setup_task() + self.setup_workspace() + self.setup_terminal() + self._reset_env_state() # Notify all tools that the environment is reset and get their observations @@ -504,6 +486,3 @@ def close(self): def __del__(self): self.close() - - def load_dataset(self, problems: str | list[str] | None = None): - return {"custom": None} diff --git a/debug_gym/gym/envs/local.py b/debug_gym/gym/envs/local.py new file mode 100644 index 00000000..cf9d5c3e --- /dev/null +++ b/debug_gym/gym/envs/local.py @@ -0,0 +1,57 @@ +from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.terminal import Terminal + + +class LocalEnv(RepoEnv): + + def __init__( + self, + path: str, + terminal: Terminal | None = None, + entrypoint: str = "python -m pytest -sq .", + debug_entrypoint: str | None = None, + **kwargs, + ): + task_data = {"path": path} + terminal = terminal or LocalTerminal() + super().__init__( + task_data=task_data, + terminal=terminal, + entrypoint=entrypoint, + debug_entrypoint=debug_entrypoint, + **kwargs, + ) + + @property + def instructions(self) -> str: + return f"Investigate the current repository, run the tests to figure out any issues, then rewrite the code to fix them." + + @property + def task(self) -> str: + return self.task_data["path"].split("/")[-1] + + def setup_task(self) -> None: + """Setup the task information. Called once at reset.""" + self.path = self.task_data["path"] + + def setup_workspace(self) -> None: + """Setup the workspace. Called once at reset.""" + self.workspace.reset() + self.workspace.copy_content(self.path) + self.workspace.setup_file_filters() + + def setup_terminal(self) -> None: + """Setup the terminal. 
Called once at reset.""" + + self.logger.debug(f"Configuring {self.terminal}...") + + self.terminal.run("git init -b main") + self.terminal.run("git config user.name 'debug-gym'") + self.terminal.run("git config user.email '<>'") + + self.terminal.run("git add *") + self.terminal.run("git commit -am 'Init'") + + self.terminal.run("git add .debugignore .debugreadonly") + self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") diff --git a/debug_gym/gym/envs/mini_nightmare.py b/debug_gym/gym/envs/mini_nightmare.py index b5cee0a8..4afe9174 100644 --- a/debug_gym/gym/envs/mini_nightmare.py +++ b/debug_gym/gym/envs/mini_nightmare.py @@ -1,3 +1,4 @@ +import logging import tempfile from pathlib import Path @@ -7,14 +8,16 @@ from debug_gym.gym.envs.env import RepoEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.logger import DebugGymLogger DOCKER_MINI_NIGHTMARE_IMAGE_NAME = "debug-gym:mini-nightmare" -def build_docker_image(logger): +def build_docker_image(logger: logging.Logger | None = None): """ Build a Docker image for the Mini Nightmare environment. """ + logger = logger or DebugGymLogger("debug-gym") # Check if Docker image is built. import docker @@ -74,6 +77,7 @@ class MiniNightmareEnv(RepoEnv): def __init__( self, + task_data: dict, entrypoint: str = "python -m pytest --tb=no -s test.py", terminal: Terminal | None = None, **kwargs, @@ -85,7 +89,9 @@ def __init__( if hasattr(terminal, "base_image") and terminal.base_image is None: terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME - super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs) + super().__init__( + task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs + ) @property def instructions(self) -> str: @@ -95,6 +101,10 @@ def instructions(self) -> str: " Beaware that the bug may not be in the code you initially see." ) + @property + def task_name(self) -> str: + return self.current_task["task_name"] + def calculate_max_score(self, eval_output: EvalOutput) -> int: return utils.extract_max_score_from_pytest_output(eval_output.output) @@ -107,10 +117,8 @@ def eval(self, **kwargs) -> EvalOutput: self.last_eval = EvalOutput(success, output) return self.last_eval - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError(f"Task {task_name} not found in the dataset.") - self.current_task = self.dataset[task_name] + def setup_task(self): + self.current_task = self.task_data def setup_workspace(self): self.workspace.reset() @@ -138,29 +146,44 @@ def setup_terminal(self): ) # Mini-nightmare tasks come with those. self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'") - def load_dataset(self, problems: str | list[str] | None = None): - if isinstance(self.terminal, DockerTerminal): - build_docker_image(self.logger) + @classmethod + def load_dataset( + cls, + problems: str | list[str] | None = None, + build_image: bool = True, + logger: object = None, + **kwargs, + ) -> dict: + if build_image: + build_docker_image(logger) - if not self.DATA_PATH.exists(): + if not MiniNightmareEnv.DATA_PATH.exists(): zipped_data = utils.download( - self.DATA_URL, self.DATA_PATH, f"Downloading mini-nightmare dataset." 
+ MiniNightmareEnv.DATA_URL, + MiniNightmareEnv.DATA_PATH, + f"Downloading mini-nightmare dataset.", ) - utils.unzip(zipped_data, dst=self.DATA_PATH.parent) + utils.unzip(zipped_data, dst=cls.DATA_PATH.parent) dataset = {} - for task_name in self.TASK_NAMES: - task_path = self.DATA_PATH / task_name + for task_name in cls.TASK_NAMES: + task_path = cls.DATA_PATH / task_name assert (task_path / "test.py").exists() assert (task_path / f"{task_name}_code.py").exists() assert (task_path / ".debugignore").exists() assert (task_path / ".debugreadonly").exists() dataset[task_name] = { + "task_name": task_name, "codebase": task_path, "filename": task_name + "_code.py", } problems = utils.filter_problems(dataset, problems) - dataset = {id: i for id, i in dataset.items() if id in problems} + dataset = {id: data for id, data in dataset.items() if id in problems} + + # Add env_type to each task_data. + for task_data in dataset.values(): + task_data["env_type"] = "mini_nightmare" + return dataset diff --git a/debug_gym/gym/envs/r2egym.py b/debug_gym/gym/envs/r2egym.py index 47bc135c..8429a8eb 100644 --- a/debug_gym/gym/envs/r2egym.py +++ b/debug_gym/gym/envs/r2egym.py @@ -1,4 +1,5 @@ import json +import logging import re from importlib.resources import files as importlib_files from pathlib import Path @@ -14,6 +15,7 @@ from debug_gym.gym.terminals.kubernetes import KubernetesTerminal from debug_gym.gym.terminals.terminal import Terminal from debug_gym.gym.utils import filter_problems +from debug_gym.logger import DebugGymLogger def decolor_dict_keys(key): @@ -64,9 +66,7 @@ class R2EGymEnv(RepoEnv): def __init__( self, - dataset_id: str = "R2E-Gym/R2E-Gym-Lite", - dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", - split: str = "train", + task_data: dict, terminal: Terminal | None = None, **kwargs, ): @@ -76,100 +76,38 @@ def __init__( "R2EGymEnv only supports DockerTerminal and KubernetesTerminal." ) - self.dataset_id = dataset_id - self.dataset_revision = dataset_revision - self.split = split + super().__init__(task_data=task_data, terminal=terminal, **kwargs) self.session_commands = [] - super().__init__(terminal=terminal, **kwargs) + @property + def task_name(self) -> str: + return self.task_data["instance_id"] @property def instructions(self) -> str: # try getting the content inside of [ISSUE] [/ISSUE] using regex tags for ds['problem_statement'] else return ds['problem_statement'] # ref: https://github.com/R2E-Gym/R2E-Gym/blob/main/src/r2egym/agenthub/runtime/docker.py#L592 try: - content = self.ds_row["problem_statement"] + content = self.task_data["problem_statement"] return re.search(r"\[ISSUE\](.*)\[/ISSUE\]", content, re.DOTALL).group(1) except Exception as e: - return self.ds_row["problem_statement"] - - def load_dataset(self, problems: str | list[str] | None = None): - data_path = Path(self.dataset_id) - if data_path.is_file(): - # Loading from local file. - if data_path.suffix.lower() == ".json": - self.ds = load_dataset("json", data_files=self.dataset_id) - elif data_path.suffix.lower() == ".parquet": - self.ds = load_dataset("parquet", data_files=self.dataset_id) - elif data_path.is_dir(): - # Loading from local folder. - self.ds = load_from_disk(self.dataset_id) - else: - # Loading from HuggingFace or a folder. - self.ds = load_dataset(self.dataset_id, revision=self.dataset_revision) - - # Select the split. - self.ds = self.ds[self.split] - - # Load custom dataset splits from config. 
- with open(R2EGymEnv.CONFIG) as f: - custom_splits = yaml.safe_load(f) - excluded_ids = custom_splits.get("excluded", []) - - dataset = { - id.split("/", 1)[-1]: i for i, id in enumerate(self.ds["docker_image"]) - } - problems = filter_problems(dataset, problems, custom_splits, excluded_ids) - dataset = {id: i for id, i in dataset.items() if id in problems} - - image_names = set(self.ds[dataset[id]]["docker_image"] for id in dataset) - self.logger.debug( - f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {self.dataset_id}." - ) - - if not isinstance(self.terminal, KubernetesTerminal): - # Download all images needed for R2E-Gym. - client = docker.from_env() - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = image_names - existing_images - if missing_images: - self.logger.warning( - f"Found {len(missing_images)} missing Docker images." - ) - for i, image_name in enumerate(missing_images): - self.logger.warning( - f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." - ) - client.images.pull(image_name) - - return dataset - - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError( - f"Task `{task_name}` was not found in dataset. The available tasks are: {self.dataset}.\n" - "Please provide a valid task or initialize the environment without problems to load all tasks." - ) + return self.task_data["problem_statement"] - self.task_name = task_name - self.ds_row = self.ds[self.dataset[self.task_name]] - self.base_image = self.ds_row["docker_image"] - self.package_name = self.ds_row["repo_name"] - self.expected_output = json.loads(self.ds_row["expected_output_json"]) + def setup_task(self): + self.base_image = self.task_data["docker_image"] + self.package_name = self.task_data["repo_name"] + self.expected_output = json.loads(self.task_data["expected_output_json"]) self.expected_output = decolor_dict_keys(self.expected_output) self.expected_output = { k.split(" - ")[0]: self.expected_output[k] for k in sorted(self.expected_output.keys()) } - self.commit_hash = self.ds_row["commit_hash"] + self.commit_hash = self.task_data["commit_hash"] self.entrypoint = "python -m pytest -W ignore -rA r2e_tests" if self.package_name == "pillow": - test_file_codes = json.loads(self.ds_row["execution_result_content"])[ + test_file_codes = json.loads(self.task_data["execution_result_content"])[ "test_file_codes" ] if any(["unittest" in test_code for test_code in test_file_codes]): @@ -314,3 +252,75 @@ def calculate_score(self, eval_output: EvalOutput) -> int: reward = 1 if match else 0 return reward + + @classmethod + def load_dataset( + cls, + dataset_id: str = "R2E-Gym/R2E-Gym-Lite", + dataset_revision: str = "8d3163011f01f9393bb3dc7700497a79a8686ae5", + split: str = "train", + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, + **kwargs, + ) -> dict: + logger = logger or DebugGymLogger("debug_gym") + data_path = Path(dataset_id) + + if data_path.is_file(): + # Loading from local file. + if data_path.suffix.lower() == ".json": + ds = load_dataset("json", data_files=dataset_id) + elif data_path.suffix.lower() == ".parquet": + ds = load_dataset("parquet", data_files=dataset_id) + elif data_path.is_dir(): + # Loading from local folder. + ds = load_from_disk(dataset_id) + else: + # Loading from HuggingFace or a folder. + ds = load_dataset(dataset_id, revision=dataset_revision) + + # Select the split. 
+ ds = ds[split] + + # Load custom dataset splits from config. + with open(R2EGymEnv.CONFIG) as f: + custom_splits = yaml.safe_load(f) + excluded_ids = custom_splits.get("excluded", []) + + def extract_instance_id(docker_image: str) -> str: + return docker_image.split("/", 1)[-1] + + id2idx = { + extract_instance_id(docker_image): i + for i, docker_image in enumerate(ds["docker_image"]) + } + problems = filter_problems(id2idx, problems, custom_splits, excluded_ids) + dataset = {problem: ds[id2idx[problem]] for problem in problems} + + # Add instance_id (name of the image) and env_type to each task_data. + for instance_id, task_data in dataset.items(): + task_data["instance_id"] = instance_id + task_data["env_type"] = "r2egym" + + image_names = set(task_data["docker_image"] for task_data in dataset.values()) + logger.debug( + f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." + ) + + if prepull_images: + # Download all images needed for R2E-Gym. + client = docker.from_env() + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = image_names - existing_images + if missing_images: + logger.warning(f"Found {len(missing_images)} missing Docker images.") + for i, image_name in enumerate(missing_images): + logger.warning( + f"Pulling Docker image {i + 1}/{len(missing_images)} `{image_name}`." + ) + client.images.pull(image_name) + return dataset diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py index b438dbe0..5044f0e6 100644 --- a/debug_gym/gym/envs/swe_bench.py +++ b/debug_gym/gym/envs/swe_bench.py @@ -12,7 +12,7 @@ from debug_gym.gym.envs.env import RepoEnv from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.terminals.kubernetes import KubernetesTerminal -from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.terminals.terminal import DebugGymLogger, Terminal from debug_gym.gym.utils import filter_problems @@ -21,9 +21,7 @@ class SWEBenchEnv(RepoEnv): def __init__( self, - dataset_id: str = "SWE-bench/SWE-bench_Verified", - dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", - split: str = "test", + task_data: dict, terminal: Terminal | None = None, **kwargs, ): @@ -33,81 +31,37 @@ def __init__( f"{self.__class__.__name__} only supports DockerTerminal and KubernetesTerminal." ) - self.dataset_id = dataset_id - self.dataset_revision = dataset_revision - self.split = split self.test_directives = [] - - super().__init__(terminal=terminal, **kwargs) + super().__init__(task_data=task_data, terminal=terminal, **kwargs) @property def instructions(self) -> str: - return self.ds_row["problem_statement"] - - def load_dataset(self, problems: str | list[str] | None = None): - self.ds = datasets.load_dataset( - self.dataset_id, revision=self.dataset_revision - )[self.split] - dataset = {id: i for i, id in enumerate(self.ds["instance_id"])} - problems = filter_problems(dataset, problems) - dataset = {id: i for id, i in dataset.items() if id in problems} - - instance_ids = [self.ds[dataset[id]]["instance_id"] for id in dataset] - image_names = set( - f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in instance_ids - ) - - if not isinstance(self.terminal, KubernetesTerminal): - # Download all images needed for SWE-Bench. 
- client = docker.from_env() - tagged_image_names = set(f"swebench/{name}:latest" for name in image_names) - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = tagged_image_names - existing_images - if missing_images: - self.logger.info(f"Found {len(missing_images)} missing Docker images.") - for i, image_name in enumerate(missing_images): - self.logger.info( - f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`." - ) - client.images.pull(image_name) - - return dataset + return self.task_data["problem_statement"] - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError( - f"Task `{task_name}` was not found in dataset. The available tasks are: {sorted(self.dataset)}.\n" - "Please provide a valid task or initialize the environment without problems to load all tasks." - ) + @property + def task_name(self) -> str: + return self.task_data["instance_id"] - self.task_name = task_name - self.ds_row = self.ds[self.dataset[self.task_name]] - self.repo = self.ds_row["repo"] + def setup_task(self): + self.repo = self.task_data["repo"] self.package_name = self.repo.split("/")[1] - self.version = self.ds_row["version"] + self.version = self.task_data["version"] self.install_configs = MAP_REPO_VERSION_TO_SPECS[self.repo][self.version] - self.gold_patch = self.ds_row["patch"] - self.test_spec = make_test_spec(self.ds_row) + self.gold_patch = self.task_data["patch"] + self.test_spec = make_test_spec(self.task_data) self.base_image = f"swebench/{self.test_spec.instance_image_key}".replace( "__", "_1776_" ) - self.base_commit = self.ds_row["base_commit"] - self.test_patch = self.ds_row["test_patch"] - self.fail_to_pass = json.loads(self.ds_row["FAIL_TO_PASS"]) - self.pass_to_pass = json.loads(self.ds_row["PASS_TO_PASS"]) + self.base_commit = self.task_data["base_commit"] + self.test_patch = self.task_data["test_patch"] + self.fail_to_pass = json.loads(self.task_data["FAIL_TO_PASS"]) + self.pass_to_pass = json.loads(self.task_data["PASS_TO_PASS"]) self.test_cmd = self.install_configs["test_cmd"] - self.test_directives = get_test_directives(self.ds_row) + self.test_directives = get_test_directives(self.task_data) self.entrypoint = " ".join([self.test_cmd, *self.test_directives]) if self.package_name == "sphinx" or self.package_name == "sympy": - # use pytest instead of `sympy bin/test` and `sphinx tox` so pdb breakpoints work - expression = " ".join(self.test_directives) - self.entrypoint = f"python -m pytest {expression}" - if self.entrypoint.startswith("PYTHONWARNINGS"): # Move PYTHONWARNINGS from the entrypoint to the session commands export, remaining = self.entrypoint.split(" ", 1) @@ -128,6 +82,11 @@ def setup_task(self, task_name: str, options: dict = None): # -q (quiet) with pytest avoids long pytest output self.debug_entrypoint = self.entrypoint.replace("pytest", "pytest -sq") + if self.package_name == "sphinx" or self.package_name == "sympy": + # use pytest instead of `sympy bin/test` and `sphinx tox` so pdb breakpoints work + expression = " ".join(self.test_directives) + self.debug_entrypoint = f"python -m pytest {expression}" + # --tb=short with pytest keeps the output concise self.entrypoint = self.entrypoint.replace("--tb=no", "--tb=short") @@ -165,6 +124,8 @@ def setup_terminal(self): self.terminal.run('echo "127.0.0.1 httpbin.org" >> /etc/hosts') elif self.task_name == "pylint-dev__pylint-4661": self.terminal.run("pip install appdirs==1.4.4") + elif 
self.package_name == "sphinx" or self.package_name == "sympy": + self.terminal.run("pip install pytest") # Apply any changes needed to the install commands. self.terminal.run("git config user.name 'debug-gym'") @@ -214,3 +175,49 @@ def calculate_score(self, eval_output: EvalOutput) -> int: ) assert score <= self.max_score return score + + @classmethod + def load_dataset( + cls, + dataset_id: str = "SWE-bench/SWE-bench_Verified", + dataset_revision: str = "99450355ca8c611021187a57ffac304b66666738", + split: str = "test", + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, + **kwargs, + ) -> dict: + ds = datasets.load_dataset(dataset_id, revision=dataset_revision)[split] + + # Memory efficient filtering of problems. + id2idx = {id: i for i, id in enumerate(ds["instance_id"])} + problems = filter_problems(id2idx, problems) + dataset = {problem: ds[id2idx[problem]] for problem in problems} + + # Add env_type to each task_data. + for task_data in dataset.values(): + task_data["env_type"] = "swebench" + + image_names = set( + f"sweb.eval.x86_64.{id.replace('__', '_1776_')}" for id in dataset + ) + + if prepull_images: + # Download all images needed for SWE-Bench. + client = docker.from_env() + tagged_image_names = set(f"swebench/{name}:latest" for name in image_names) + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = tagged_image_names - existing_images + if missing_images: + if logger: + logger.info(f"Found {len(missing_images)} missing Docker images.") + for i, image_name in enumerate(missing_images): + if logger: + logger.info( + f"Pulling Docker images {i + 1}/{len(missing_images)}: `{image_name}`." + ) + client.images.pull(image_name) + return dataset diff --git a/debug_gym/gym/envs/swe_bench_debug.py b/debug_gym/gym/envs/swe_bench_debug.py index 19b6c356..2f8cd2f4 100644 --- a/debug_gym/gym/envs/swe_bench_debug.py +++ b/debug_gym/gym/envs/swe_bench_debug.py @@ -15,3 +15,13 @@ def eval(self, **kwargs) -> EvalOutput: success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout) self.last_eval = EvalOutput(success, output) return self.last_eval + + @classmethod + def load_dataset(cls, *args, **kwargs) -> dict: + dataset = SWEBenchEnv.load_dataset(*args, **kwargs) + + # Add env_type to each task_data. 
+ for task_data in dataset.values(): + task_data["env_type"] = "swebench-debug" + + return dataset diff --git a/debug_gym/gym/envs/swe_smith.py b/debug_gym/gym/envs/swe_smith.py index ce7ef627..d71d3f5e 100644 --- a/debug_gym/gym/envs/swe_smith.py +++ b/debug_gym/gym/envs/swe_smith.py @@ -1,5 +1,6 @@ from importlib.resources import files as importlib_files from pathlib import Path +from typing import List import docker import yaml @@ -14,8 +15,7 @@ from debug_gym.constants import DEBUG_GYM_CACHE_DIR from debug_gym.gym.entities import EvalOutput from debug_gym.gym.envs.swe_bench import SWEBenchEnv -from debug_gym.gym.terminals.kubernetes import KubernetesTerminal -from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.terminals.terminal import DebugGymLogger from debug_gym.gym.utils import filter_problems @@ -25,100 +25,20 @@ class SWESmithEnv(SWEBenchEnv): importlib_files("debug_gym") / "gym" / "envs" / "configs" / "swe_smith.yaml" ) - def __init__( - self, - dataset_id: str = "SWE-bench/SWE-smith", - dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", - split: str = "train", - terminal: Terminal | None = None, - **kwargs, - ): - super().__init__( - dataset_id=dataset_id, - dataset_revision=dataset_revision, - split=split, - terminal=terminal, - **kwargs, - ) - - def load_dataset(self, problems: str | list[str] | None = None): - data_path = Path(self.dataset_id) - if data_path.is_file(): - # Loading from local file. - if data_path.suffix.lower() == ".json": - self.ds = load_dataset("json", data_files=self.dataset_id) - elif data_path.suffix.lower() == ".parquet": - self.ds = load_dataset("parquet", data_files=self.dataset_id) - elif data_path.is_dir(): - # Loading from local folder. - self.ds = load_from_disk(self.dataset_id) - else: - # Loading from HuggingFace or a folder. - self.ds = load_dataset(self.dataset_id, revision=self.dataset_revision) - - # Select the split. - self.ds = self.ds[self.split] - - # Load custom dataset splits from config. - with open(SWESmithEnv.CONFIG) as f: - custom_splits = yaml.safe_load(f) - excluded_ids = custom_splits.get("excluded", []) - - dataset = {id: i for i, id in enumerate(self.ds["instance_id"])} - problems = filter_problems(dataset, problems, custom_splits, excluded_ids) - dataset = {id: i for id, i in dataset.items() if id in problems} - - image_names = set(self.ds[dataset[id]]["image_name"] for id in dataset) - self.logger.debug( - f"Loaded {len(dataset)} tasks accross {len(image_names)} Docker images from {self.dataset_id}." - ) - - if not isinstance(self.terminal, KubernetesTerminal): - # Download all images needed for SWE-Smith. - client = docker.from_env() - tagged_image_names = set( - f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names - ) - - existing_images = set( - tag for image in client.images.list() for tag in image.tags - ) - missing_images = tagged_image_names - existing_images - if missing_images: - self.logger.info(f"Found {len(missing_images)} missing Docker images.") - for image_name in missing_images: - docker_hub_image = image_name.replace("__", "_1776_") - self.logger.info( - f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." - ) - client.images.pull(docker_hub_image) - # Rename images via tagging - client.images.get(docker_hub_image).tag(image_name) - - return dataset - - def setup_task(self, task_name: str, options: dict = None): - if task_name not in self.dataset: - raise ValueError( - f"Task `{task_name}` was not found in dataset. 
The available tasks are: {sorted(self.dataset)}.\n" - "Please provide a valid task or initialize the environment without problems to load all tasks." - ) - - self.task_name = task_name - self.ds_row = self.ds[self.dataset[self.task_name]] + def setup_task(self): self.base_commit = ( - self.ds_row["base_commit"] if "base_commit" in self.ds_row else "main" + self.task_data["base_commit"] if "base_commit" in self.task_data else "main" ) - self.branch_name = self.ds_row["instance_id"] - self.bug_patch = self.ds_row["patch"] - self.image_name = self.ds_row["image_name"] + self.branch_name = self.task_data["instance_id"] + self.bug_patch = self.task_data["patch"] + self.image_name = self.task_data["image_name"] self.repo, self.commit = get_repo_commit_from_image_name(self.image_name) self.install_configs = MAP_REPO_TO_SPECS[self.repo][self.commit] self.base_image = f"{DOCKER_ORG}/{self.image_name}:{TAG}" self.package_name = self.repo.split("/")[1] - self.test_cmd, self.test_directives = get_test_command(self.ds_row) - self.fail_to_pass = self.ds_row["FAIL_TO_PASS"] - self.pass_to_pass = self.ds_row["PASS_TO_PASS"] + self.test_cmd, self.test_directives = get_test_command(self.task_data) + self.fail_to_pass = self.task_data["FAIL_TO_PASS"] + self.pass_to_pass = self.task_data["PASS_TO_PASS"] self.log_parser = MAP_REPO_TO_PARSER.get(self.repo, parse_log_pytest) if self.package_name == "python-colorlog": @@ -188,7 +108,9 @@ def setup_terminal(self): # Apply bug patch. self.terminal.run(f"git apply - <<'EOF'\n{self.bug_patch}\nEOF", raises=True) - self.terminal.run(f"git commit -am 'Applying bug patch for {self.task_name}'") + self.terminal.run( + f"git commit -am 'Applying bug patch for {self.task_name}' --no-verify" + ) def calculate_score(self, eval_output: EvalOutput) -> int: test_status_map = self.log_parser(eval_output.output) @@ -220,3 +142,75 @@ def eval(self, **kwargs) -> EvalOutput: success, output = self.terminal.run(self.entrypoint, timeout=self.run_timeout) self.last_eval = EvalOutput(success, output) return self.last_eval + + @classmethod + def load_dataset( + cls, + dataset_id: str = "SWE-bench/SWE-smith", + dataset_revision: str = "699b53400d3855206a0fbf3ff4beaf1a52f4f232", + split: str = "train", + problems: list | None = None, + prepull_images: bool = False, + logger: DebugGymLogger | None = None, + **kwargs, + ) -> dict: + logger = logger or DebugGymLogger("debug_gym") + data_path = Path(dataset_id) + if data_path.is_file(): + # Loading from local file. + if data_path.suffix.lower() == ".json": + ds = load_dataset("json", data_files=dataset_id) + elif data_path.suffix.lower() == ".parquet": + ds = load_dataset("parquet", data_files=dataset_id) + elif data_path.is_dir(): + # Loading from local folder. + ds = load_from_disk(dataset_id) + else: + # Loading from HuggingFace or a folder. + ds = load_dataset(dataset_id, revision=dataset_revision) + + # Select the split. + ds = ds[split] + + # Load custom dataset splits from config. + with open(SWESmithEnv.CONFIG) as f: + custom_splits = yaml.safe_load(f) + excluded_ids = custom_splits.get("excluded", []) + + # Memory efficient filtering of problems. + id2idx = {id: i for i, id in enumerate(ds["instance_id"])} + problems = filter_problems(id2idx, problems, custom_splits, excluded_ids) + dataset = {problem: ds[id2idx[problem]] for problem in problems} + + # Add env_type to each task_data. 
+ for task_data in dataset.values(): + task_data["env_type"] = "swesmith" + + image_names = set(task_data["image_name"] for task_data in dataset.values()) + logger.debug( + f"Loaded {len(dataset)} tasks across {len(image_names)} Docker images from {dataset_id}." + ) + + if prepull_images: + # Download all images needed for SWE-Smith. + client = docker.from_env() + tagged_image_names = set( + f"{DOCKER_ORG}/{name}:{TAG}" for name in image_names + ) + + existing_images = set( + tag for image in client.images.list() for tag in image.tags + ) + missing_images = tagged_image_names - existing_images + if missing_images: + logger.info(f"Found {len(missing_images)} missing Docker images.") + + for image_name in missing_images: + docker_hub_image = image_name.replace("__", "_1776_") + logger.info( + f"Pulling Docker image `{docker_hub_image}` to `{image_name}`." + ) + client.images.pull(docker_hub_image) + # Rename images via tagging + client.images.get(docker_hub_image).tag(image_name) + return dataset diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index 5c5f39bc..7830d65b 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -1,4 +1,5 @@ import atexit +import hashlib import json import os import random @@ -37,6 +38,9 @@ def _clean_for_kubernetes(name: str) -> str: # replace any characters not in the regex with hyphens cleaned = "".join(c if c.isalnum() or c in "-." else "-" for c in name).lower() # ensure it starts and ends with alphanumeric character + cleaned = cleaned.replace("/", "-") + cleaned = cleaned.replace(":", "-") + cleaned = cleaned.replace(".", "-") cleaned = cleaned.strip("-").strip(".") # truncate to 253 characters return cleaned[:253] @@ -253,6 +257,9 @@ def __str__(self): class KubernetesTerminal(Terminal): + """ + Note: reads values of env variables K8S_NAMESPACE, K8S_DOCKER_SECRET, K8S_DOCKER_CONSTRAINT. 
+ """ def __init__( self, @@ -264,8 +271,9 @@ def __init__( setup_commands: list[str] | None = None, pod_name: str | None = None, base_image: str | None = None, - registry: str = "", - namespace: str = "default", + image_pull_secret: str | None = None, + registry: str = "docker.io", + namespace: str | None = None, kube_config: str | None = None, kube_context: str | None = None, extra_labels: dict | None = None, @@ -282,7 +290,11 @@ def __init__( self.base_image = base_image self._task_name = base_image self.setup_commands = setup_commands or [] - self.namespace = namespace + self.namespace = namespace or os.environ.get("K8S_NAMESPACE", "default") + self.image_pull_secret = image_pull_secret or os.environ.get( + "K8S_DOCKER_SECRET" + ) + self.in_node_constraint = os.environ.get("K8S_IN_NODE_CONSTRAINT", False) self.kubernetes_kwargs = kwargs # e.g., nodeSelector, tolerations self.registry = registry.rstrip("/") + "/" if registry else "" self._pod_name = pod_name @@ -487,13 +499,37 @@ def setup_pod(self, max_retries: int = 3) -> None: for attempt in range(max_retries): # Generate a new pod name for each attempt to avoid sandbox conflicts pod_name = _clean_for_kubernetes( - self._pod_name or f"dbg-gym.{self.task_name}.{str(uuid.uuid4())[:8]}" + self._pod_name or f"dbg-gym-{self.task_name}-{str(uuid.uuid4())[:8]}" ) self.logger.debug( f"Setting up pod {pod_name} (attempt {attempt + 1}/{max_retries}) " f"with image: {self.registry}{self.base_image}" ) + # set image pull secrets, don't override imagePullSecrets + if self.image_pull_secret and not "imagePullSecrets" in pod_spec_kwargs: + pod_spec_kwargs["imagePullSecrets"] = [{"name": self.image_pull_secret}] + + # set in node constraint, don't override affinity + if self.in_node_constraint and not "affinity" in pod_spec_kwargs: + pod_spec_kwargs["affinity"] = { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "kubernetes.io/hostname", + "operator": "In", + "values": [os.environ["HOSTNAME"]], + } + ] + } + ] + } + } + } + # Create pod specification for Kubernetes. 
pod_body = { "apiVersion": "v1", diff --git a/debug_gym/gym/terminals/terminal.py b/debug_gym/gym/terminals/terminal.py index eaaa7264..90f4eb8a 100644 --- a/debug_gym/gym/terminals/terminal.py +++ b/debug_gym/gym/terminals/terminal.py @@ -32,8 +32,9 @@ def __init__( self._working_dir = working_dir self.sessions = [] + kwargs.pop("type", None) # remove 'type' if present if kwargs: - self.logger.warning(f"Ignoring unknown parameters: {kwargs}") + self.logger.debug(f"Ignoring unknown parameters: {kwargs}") @property def working_dir(self): diff --git a/debug_gym/gym/utils.py b/debug_gym/gym/utils.py index 24372a44..ee07c1c9 100644 --- a/debug_gym/gym/utils.py +++ b/debug_gym/gym/utils.py @@ -200,7 +200,7 @@ def filter_problems( problems: str | list[str] | None = None, custom_splits: dict[str, Any] | None = None, excluded_ids: list[str] | None = None, -) -> dict[str, Any]: +) -> list[str]: excluded_ids = excluded_ids or [] custom_splits = custom_splits or {} problems = "all" if problems is None else problems diff --git a/debug_gym/logger.py b/debug_gym/logger.py index 805c807f..386efe15 100644 --- a/debug_gym/logger.py +++ b/debug_gym/logger.py @@ -16,7 +16,14 @@ from rich.markup import escape from rich.padding import Padding from rich.panel import Panel -from rich.progress import BarColumn, Progress, SpinnerColumn, Task, TextColumn +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + Task, + TextColumn, + TimeElapsedColumn, +) from rich.table import Table from rich.text import Text @@ -205,6 +212,7 @@ def __init__( ScoreColumn(), BarColumn(bar_width=None), TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), expand=True, ) self._tasks_panel = Panel( @@ -307,20 +315,26 @@ def refresh_progress(self, all_tasks: bool = False): def _visible_tasks(self) -> Dict[str, Dict[str, Any]]: """Get visible tasks limited to the maximum display count, - showing pending/running tasks first, then completed tasks. + showing running tasks first, then pending, then failed ones, then completed tasks. 
Returns a dictionary mapping task IDs to their corresponding task data for visible tasks only.""" - # Get task IDs for pending, then completed tasks + # Get task IDs for running, pending, failed, and completed tasks + running = [] pending = [] + failed = [] completed = [] for tid, task in self._tasks.items(): - if task.completed: - completed.append(tid) - else: + if task.status == "running": + running.append(tid) + elif task.status == "pending": pending.append(tid) - # Limit to max_display tasks, showing pending first - visible_task_ids = (pending + completed)[: self.max_display] + elif task.status in ("error", "unresolved"): + failed.append(tid) + elif task.completed: + completed.append(tid) + # Limit to max_display tasks, showing running first, then pending, then failed, then completed + visible_task_ids = (running + pending + failed + completed)[: self.max_display] # Return the actual task data for the visible tasks return {tid: self._tasks[tid] for tid in visible_task_ids} @@ -377,6 +391,7 @@ def __init__( TextColumn("[progress.description]{task.description}"), BarColumn(bar_width=None), TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), expand=True, ) self.total = len(problems) @@ -505,6 +520,7 @@ class DebugGymLogger(logging.Logger): LOG_QUEUE = mp.Queue(maxsize=10000) PROGRESS_QUEUE = mp.Queue(maxsize=50000) # Increased from 10000 to 50000 _is_worker = False + _main_process_logger = None @classmethod def is_worker(cls): @@ -541,6 +557,8 @@ def __init__( self.propagate = False self.setLevel(level) # Set logger level, might be overridden by file handler + if DebugGymLogger._main_process_logger is not None: + self._is_worker = True # Placeholders for rich live, log listener thread, and stop event # Will be initialized if the logger is the main process logger @@ -550,6 +568,8 @@ def __init__( self._log_listener_thread = None # Thread to process logs from workers if self.is_main(): self._initialize_main_logger(level) + DebugGymLogger._main_process_logger = self + self.log_file = None # File handler for logging to a file self.log_dir = Path(log_dir) if log_dir else None if self.log_dir: # Directory to store log files diff --git a/scripts/config.yaml b/scripts/config.yaml index ee3952c5..62e0f075 100644 --- a/scripts/config.yaml +++ b/scripts/config.yaml @@ -1,16 +1,14 @@ base: # Environment configs output_path: "exps/pytorch" - env_kwargs: { - "path": "data/pytorch", - "entrypoint": "python -m pytest -sv test.py", - "debug_entrypoint": "python -m pdb -m pytest -s test.py", - "run_timeout": 10, - } - tools: ["pdb", "view", "rewrite"] - terminal: { - type: "docker", # "local", "docker", or "kubernetes" - } + env: + type: "local" + path: "data/pytorch" + entrypoint: "python -m pytest -sv test.py" + debug_entrypoint: "python -m pdb -m pytest -s test.py" + run_timeout: 10 + terminal: + type: "docker" # "local", "docker", or "kubernetes" # LLM configs llm_name: "gpt-4o" diff --git a/scripts/config_aider.yaml b/scripts/config_aider.yaml index 88dd68fb..53e53c8e 100644 --- a/scripts/config_aider.yaml +++ b/scripts/config_aider.yaml @@ -1,14 +1,16 @@ base: # Environment configs output_path: "exps/aider" - benchmark: "aider" - problems: "all" # list of problems, e.g., ["wordy"], or "all" - env_kwargs: { - "run_timeout": 20, - } - terminal: { - type: "docker", # "docker", "kubernetes", or "local" - } + + env: + run_timeout: 20 + + terminal: + type: "docker" # "docker", "kubernetes", or "local" + + dataset: + type: "aider" + problems: "all" # list of problems, e.g., 
["wordy"], or "all" # LLM configs llm_name: "gpt-4o" diff --git a/scripts/config_mini_nightmare.yaml b/scripts/config_mini_nightmare.yaml index 88fbc08a..3be2f52f 100644 --- a/scripts/config_mini_nightmare.yaml +++ b/scripts/config_mini_nightmare.yaml @@ -1,16 +1,16 @@ base: # Environment configs output_path: "exps/mini_nightmare" - benchmark: "mini_nightmare" - problems: "all" # list of problems, e.g., ["config"], or "all" - env_kwargs: { - "run_timeout": 30, - # shortcut features - } - - terminal: { - type: "docker", # "docker", "kubernetes", or "local" - } + + env: + run_timeout: 30 + + terminal: + type: "docker" # "docker", "kubernetes", or "local" + + dataset: + type: "mini_nightmare" + problems: "all" # list of problems, e.g., ["config"], or "all" # LLM configs llm_name: "gpt-4o" diff --git a/scripts/config_r2egym.yaml b/scripts/config_r2egym.yaml index 8d14b79e..6ce57efe 100644 --- a/scripts/config_r2egym.yaml +++ b/scripts/config_r2egym.yaml @@ -1,16 +1,18 @@ base: # Environment configs output_path: "exps/re2gym" - benchmark: "r2egym" - problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all", - env_kwargs: { - "run_timeout": 300, - dataset_id: "R2E-Gym/R2E-Gym-Lite", + + env: + run_timeout: 300 + + dataset: + type: "r2egym" + problems: "all" # list of problems, e.g., ["aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"], or strings like "d1-valid" (defined in gym/envs/configs), or "all", + dataset_id: "R2E-Gym/R2E-Gym-Lite" dataset_revision: "8d3163011f01f9393bb3dc7700497a79a8686ae5" - } - terminal: { - type: "docker", # "docker", "kubernetes" - } + + terminal: + type: "docker" # "docker", "kubernetes" # LLM configs llm_name: "gpt-4o" @@ -61,5 +63,5 @@ solution_agent: llm_name: "human" # No need for an LLM. tools: - eval - - pdb + # - pdb - submit diff --git a/scripts/config_swebench.yaml b/scripts/config_swebench.yaml index 8bc0ba55..72405b4c 100644 --- a/scripts/config_swebench.yaml +++ b/scripts/config_swebench.yaml @@ -1,16 +1,18 @@ base: # Environment configs output_path: "exps/swebench-verified" - benchmark: "swebench-debug" - problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or "all" - env_kwargs: { - "run_timeout": 300, - "dataset_id": "SWE-bench/SWE-bench_Verified", - "dataset_revision": "99450355ca8c611021187a57ffac304b66666738", - } - terminal: { - type: "docker", # "docker", "kubernetes" - } + + env: + run_timeout: 300 + + dataset: + type: "swebench" + problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or "all" + dataset_id: "SWE-bench/SWE-bench_Verified" + dataset_revision: "99450355ca8c611021187a57ffac304b66666738" + + terminal: + type: "docker" # "docker" or "kubernetes" # LLM configs llm_name: "gpt-4o" @@ -29,34 +31,6 @@ base: "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. "show_directory_tree": 0 # Value indicated the depth of the directory shown in the system prompt. 0 means no directory tree is shown. -rewrite_agent: - tools: - - grep - - view - - rewrite - - listdir - - eval: - auto_eval_on_rewrite: False # If True, the environment will automatically call the Eval tool after a successful rewrite. If this is set to True, the agent does not need to call the Eval tool itself. 
- -debug_agent: - tools: - - grep - - pdb - - view - - rewrite - - listdir - - eval - -debug_5_agent: - n_rewrites_before_pdb: 5 - tools: - - grep - - pdb - - view - - rewrite - - listdir - - eval - solution_agent: llm_name: null # No need for an LLM. tools: @@ -65,7 +39,6 @@ solution_agent: - submit swe_agent: - benchmark: "swebench" max_steps: 100 max_rewrite_steps: 20 tools: diff --git a/scripts/config_swebench_debug.yaml b/scripts/config_swebench_debug.yaml new file mode 100644 index 00000000..25bc5f28 --- /dev/null +++ b/scripts/config_swebench_debug.yaml @@ -0,0 +1,67 @@ +base: + # Environment configs + output_path: "exps/swebench-verified-debugmode" + + env: + run_timeout: 300 + + dataset: + type: "swebench-debug" + problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or "all" + dataset_id: "SWE-bench/SWE-bench_Verified" + dataset_revision: "99450355ca8c611021187a57ffac304b66666738" + + terminal: + type: "docker" # "docker" or "kubernetes" + + # LLM configs + llm_name: "gpt-4o" + + # Agent configs + random_seed: 42 + max_steps: 50 + max_rewrite_steps: 10 + memory_size: 20 + save_patch: True + reset_prompt_history_after_rewrite: False + # Optionally loads a custom system prompt template from a file. + # system_prompt_template_file: "script/templates/system_prompt.jinja" + + # Shortcut features + "show_current_breakpoints": False # If True, the environment will automatically show the current breakpoints at every step in the system prompt. + "show_directory_tree": 0 # Value indicated the depth of the directory shown in the system prompt. 0 means no directory tree is shown. + +rewrite_agent: + tools: + - grep + - view + - rewrite + - listdir + - eval: + auto_eval_on_rewrite: False # If True, the environment will automatically call the Eval tool after a successful rewrite. If this is set to True, the agent does not need to call the Eval tool itself. + +debug_agent: + tools: + - grep + - pdb + - view + - rewrite + - listdir + - eval + +debug_5_agent: + n_rewrites_before_pdb: 5 + tools: + - grep + - pdb + - view + - rewrite + - listdir + - eval + +solution_agent: + llm_name: null # No need for an LLM. + tools: + - eval + - pdb + - submit diff --git a/scripts/config_swesmith.yaml b/scripts/config_swesmith.yaml index 5862e240..2c60679a 100644 --- a/scripts/config_swesmith.yaml +++ b/scripts/config_swesmith.yaml @@ -1,15 +1,18 @@ base: # Environment configs output_path: "exps/swesmith" - benchmark: "swesmith" - problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all", - env_kwargs: { - "run_timeout": 300, - "dataset_id": "SWE-bench/SWE-smith" - } - terminal: { - type: "docker", # "docker", "kubernetes" - } + + env: + # type: "swesmith" # Not needed Will be inferred from dataset. + run_timeout: 300 + + dataset: + type: "swesmith" + dataset_id: "SWE-bench/SWE-smith" + problems: "all" # list of problems, e.g., ["astropy__astropy-12907"], or strings like "test-125" (defined in gym/envs/configs), or "all", + + terminal: + type: "docker" # LLM configs llm_name: "gpt-4o" @@ -60,5 +63,5 @@ solution_agent: llm_name: null # No need for an LLM. 
tools: - eval - - pdb + # - pdb - submit diff --git a/scripts/run.py b/scripts/run.py index a39435cf..e1a3d398 100644 --- a/scripts/run.py +++ b/scripts/run.py @@ -11,7 +11,7 @@ from debug_gym import version as dg_version from debug_gym.agents.base_agent import AGENT_REGISTRY, AgentArgs, create_agent from debug_gym.agents.utils import load_config, save_patch, save_trajectory -from debug_gym.gym.envs import select_env +from debug_gym.gym.envs import load_dataset, select_env from debug_gym.gym.terminals import select_terminal from debug_gym.gym.tools.toolbox import Toolbox from debug_gym.llms.base import LLM @@ -40,7 +40,7 @@ def timeout_handler(signum, frame): signal.alarm(timeout_seconds) -def run_agent(args, problem, config): +def run_agent(args, task_name: str, task_data: dict, config: dict): set_signal(args.timeout) success = True env = None @@ -50,22 +50,22 @@ def run_agent(args, problem, config): report_progress_error = True exp_path = Path(config["output_path"]) / config["uuid"] - problem_path = exp_path / problem + task_path = exp_path / task_name task_logger = DebugGymLogger( - problem, - log_dir=problem_path, + task_name, + log_dir=task_path, level=args.logging_level, mode="w" if args.force_all else "a", ) try: - previous_run = load_previous_run_status(problem_path, problem) + previous_run = load_previous_run_status(task_path, task_name) if ( not args.force_all and previous_run is not None and previous_run.status in ["resolved", "unresolved"] ): - task_logger.debug(f"Previous run found: {problem_path}") + task_logger.debug(f"Previous run found: {task_path}") success = previous_run.status == "resolved" task_logger.debug(f"Previous run status: {previous_run.status}") if not args.force_failed or success: @@ -78,11 +78,11 @@ def run_agent(args, problem, config): max_score=previous_run.max_score, status=status, ) - task_logger.debug(f"Skipping {problem}, already done.") + task_logger.debug(f"Skipping {task_name}, already done.") return success task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=0, total_steps=1, score=0, @@ -90,7 +90,7 @@ def run_agent(args, problem, config): status="running", ) - env = create_env(config, task_logger) + env = create_env(config, task_data, task_logger) add_tools(env, config, task_logger) llm = LLM.instantiate( @@ -103,17 +103,16 @@ def run_agent(args, problem, config): agent = create_agent( config["agent_type"], agent_args=agent_args, - env=env, llm=llm, logger=task_logger, ) try: - success = agent.run(task_name=problem, debug=args.debug) + success = agent.run(env, debug=args.debug) except KeyboardInterrupt: task_logger.error("Agent run was interrupted by user.") task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=1, total_steps=1, score=0, @@ -124,11 +123,11 @@ def run_agent(args, problem, config): raise except AgentTimeoutException: task_logger.error( - f"Timeout: Problem `{problem}` exceeded " + f"Timeout: Problem `{task_name}` exceeded " f"the time limit of {args.timeout} seconds." 
) task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=1, total_steps=1, score=0, @@ -142,23 +141,23 @@ def run_agent(args, problem, config): raise # save trajectory - save_trajectory(agent, problem, problem_path, task_logger) + save_trajectory(agent, task_name, task_path, task_logger) # optionally apply patch if config["save_patch"]: - save_patch(env, problem_path, task_logger) + save_patch(env, task_path, task_logger) except Exception as e: task_logger.error( - f"Task Error: {problem} - {e!r}. Run with --very-verbose " + f"Task Error: {task_name} - {e!r}. Run with --very-verbose " f"or check {task_logger.log_file} for more information." ) task_logger.debug( - f"Task {problem} generated an exception: {e!r}. Traceback: {traceback.format_exc()}" + f"Task {task_name} generated an exception: {e!r}. Traceback: {traceback.format_exc()}" ) if report_progress_error: task_logger.report_progress( - problem_id=problem, + problem_id=task_name, step=1, total_steps=1, score=0, @@ -177,14 +176,14 @@ def run_agent(args, problem, config): return success -def create_env(config: dict, logger: DebugGymLogger): +def create_env(config: dict, task_data: dict, logger: DebugGymLogger): terminal = select_terminal(config.get("terminal"), logger, uuid=config["uuid"]) - env_class = select_env(config.get("benchmark")) + env_class = select_env(task_data["env_type"]) env = env_class( - **config["env_kwargs"], - problems=config.get("problems", ["custom"]), + task_data=task_data, terminal=terminal, logger=logger, + **config.get("env", {}), ) return env @@ -248,9 +247,9 @@ def main(): logger.info(f"Experiment log path: {exp_output_path}") dump_experiment_info(config, args) - # Create the environment to get the list of problems to run. - env = create_env(config, logger=logger) - problems = sorted(env.dataset) + # Load the dataset based on the information found in the config. 
+ dataset = load_dataset(config["dataset"], logger=logger) + problems = sorted(dataset) if args.list: print(f"\n# Available problems in {config.get('benchmark', 'config')}:") @@ -287,9 +286,9 @@ def main(): if num_workers == 1: # run sequentially for easier debugging for problem in problems: try: - success = run_agent(args, problem, config) + success = run_agent(args, problem, dataset[problem], config) except AgentTimeoutException: - pass # Handleled in run_agent, just continue + pass # Handled in run_agent, just continue except (KeyboardInterrupt, Exception) as e: raise e else: @@ -297,7 +296,9 @@ def main(): num_workers, initializer=DebugGymLogger.set_as_worker ) as executor: futures = { - executor.submit(run_agent, args, problem, config): problem + executor.submit( + run_agent, args, problem, dataset[problem], config + ): problem for problem in problems } for future in as_completed(futures): diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py index 4a4af61e..aaac50b0 100644 --- a/tests/agents/test_agents.py +++ b/tests/agents/test_agents.py @@ -285,8 +285,8 @@ def test_create_agent(): class TestRegisteredAgent(BaseAgent): name = "test_registered" - def __init__(self, args, env, **kwargs): - super().__init__(args, env, **kwargs) + def __init__(self, agent_args, env, **kwargs): + super().__init__(agent_args, env, **kwargs) # Clear and setup registry original_registry = AGENT_REGISTRY.copy() diff --git a/tests/agents/test_utils.py b/tests/agents/test_utils.py index bb479a21..e3193d45 100644 --- a/tests/agents/test_utils.py +++ b/tests/agents/test_utils.py @@ -43,7 +43,7 @@ def test_load_config(): "--agent", "pdb_agent", "-p", - "base.random_seed=123", + "random_seed=123", "-v", "--debug", ], @@ -69,8 +69,7 @@ def test_load_config(): "--agent", "rewrite_only", "-p", - "base.random_seed=123", - "rewrite_only.random_seed=456", + "random_seed=456", "-v", "--debug", ], diff --git a/tests/gym/envs/conftest.py b/tests/gym/envs/conftest.py index 1d4056ce..1c7842f3 100644 --- a/tests/gym/envs/conftest.py +++ b/tests/gym/envs/conftest.py @@ -1,3 +1,5 @@ +import logging + import pytest from filelock import FileLock @@ -31,7 +33,12 @@ def make_env_factory(env_name, worker_id, tmp_path_factory): env_class = kwargs.pop("env_class") def _make_env(): - return env_class(**kwargs) + dataset = env_class.load_dataset( + problems=kwargs["problems"], prepull_images=True + ) + task_data = next(iter(dataset.values())) + env = env_class(task_data=task_data) + return env if worker_id == "master": # Not running with pytest-xdist or we are in the master process diff --git a/tests/gym/envs/test_aider.py b/tests/gym/envs/test_aider.py index 8786e291..ed2a2ac6 100644 --- a/tests/gym/envs/test_aider.py +++ b/tests/gym/envs/test_aider.py @@ -37,8 +37,10 @@ def setup_aider_repo(tmp_path_factory): @pytest.fixture def env(setup_aider_repo): terminal = LocalTerminal() - env = AiderBenchmarkEnv(terminal=terminal) - env.reset(options={"task_name": "clock"}) + dataset = AiderBenchmarkEnv.load_dataset() + task_data = dataset["clock"] + env = AiderBenchmarkEnv(task_data=task_data, terminal=terminal) + env.reset() return env @@ -103,13 +105,15 @@ def test_instructions(env): @patch("debug_gym.gym.envs.aider.build_docker_image") def test_build_docker_image(mock_build_docker_image): - AiderBenchmarkEnv() + dataset = AiderBenchmarkEnv.load_dataset() mock_build_docker_image.assert_called_once() @pytest.if_docker_running def test_reset_with_docker_terminal(setup_aider_repo): - env = AiderBenchmarkEnv() + dataset = 
AiderBenchmarkEnv.load_dataset() + task_data = dataset["clock"] + env = AiderBenchmarkEnv(task_data=task_data) env.add_tool(Toolbox.get_tool("eval")) assert isinstance(env.terminal, DockerTerminal) diff --git a/tests/gym/envs/test_env.py b/tests/gym/envs/test_env.py index 1a77a1ed..6a036893 100644 --- a/tests/gym/envs/test_env.py +++ b/tests/gym/envs/test_env.py @@ -6,13 +6,14 @@ from debug_gym.gym.entities import EvalOutput, Event, Observation from debug_gym.gym.envs.env import EnvInfo, EventHooks, RepoEnv, TooledEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @pytest.fixture -def env_mock(): - env = RepoEnv() +def env_mock(tmp_path): + env = LocalEnv(path=tmp_path) return env @@ -109,7 +110,7 @@ def test_tool_names(env_mock): assert env_mock.tool_names == "tool1, tool2" -def test_env_tools(): +def test_env_tools(env_mock): tool1 = MagicMock() tool1.name = "tool1" tool1.description = "instructions1" @@ -129,11 +130,10 @@ def test_env_tools(): }, } - env = RepoEnv() - env.add_tool(tool1) - env.add_tool(tool2) + env_mock.add_tool(tool1) + env_mock.add_tool(tool2) - assert env.tools == [tool1, tool2] + assert env_mock.tools == [tool1, tool2] @pytest.fixture @@ -147,7 +147,7 @@ def env(tmp_path): (repo_path / "file2.txt").touch() (subdir_path / "subfile1.txt").touch() - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) return env @@ -186,7 +186,7 @@ def test_step( mock_pdb_tool.current_frame_file = "file.py" mock_get_tool.return_value = None - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() env.last_eval = EvalOutput(success=False, output="1 failed, 0 passed") tool_call = ToolCall(id="123", name="pdb", arguments={"command": "b 10"}) @@ -210,7 +210,7 @@ def test_reset(tmp_path): (tmp_path / "test.py").write_text("def test_1():\n assert False\n") (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n") - env = RepoEnv(path=tmp_path, entrypoint="pytest test.py") + env = LocalEnv(path=tmp_path, entrypoint="pytest test.py") infos = env.reset() assert env.last_eval is None @@ -224,7 +224,7 @@ def test_reset(tmp_path): action_reasoning=None, action_content=None, action_tool_call=None, - instructions="", + instructions=env.instructions, score=0, max_score=None, terminated=False, @@ -276,7 +276,7 @@ def test_eval(tmp_path): (tmp_path / "test.py").write_text("def test_1():\n assert False\n") (tmp_path / ".debugignore").write_text("__pycache__/\n.git/\n.pytest_cache/\n") - env = RepoEnv(path=tmp_path, entrypoint="pytest test.py") + env = LocalEnv(path=tmp_path, entrypoint="pytest test.py") env.reset() env.eval() assert "FAILED test.py::test_1 - assert False" in env.last_eval.output @@ -287,7 +287,7 @@ def test_eval_success(tmp_path): # create a dummy file with open(tmp_path / "file.py", "w") as f: f.write("print('Hello, World!')") - env = RepoEnv(path=working_dir, entrypoint="python file.py") + env = LocalEnv(path=working_dir, entrypoint="python file.py") env.reset() output = env.eval() assert output == EvalOutput(success=True, output="Hello, World!") @@ -298,7 +298,7 @@ def test_eval_timeout(tmp_path): # runs for longer than the timeout with open(tmp_path / "file.py", "w") as f: f.write("import time; time.sleep(5)") - env = RepoEnv(path=working_dir, entrypoint="python file.py", run_timeout=1) + env = LocalEnv(path=working_dir, entrypoint="python file.py", run_timeout=1) env.reset() output = env.eval() assert output == EvalOutput(success=False, 
output="Timeout expired.") @@ -371,22 +371,20 @@ def test_event_hooks_notify(): subscriber.on_env_start.assert_called_once() -def test_current_breakpoints_no_breakpoints(): - env = RepoEnv() - env.current_breakpoints_state = {} - result = env.current_breakpoints() +def test_current_breakpoints_no_breakpoints(env_mock): + env_mock.current_breakpoints_state = {} + result = env_mock.current_breakpoints() assert result == "No breakpoints are set." -def test_current_breakpoints_with_breakpoints(tmp_path): - env = RepoEnv() - env.current_breakpoints_state = { +def test_current_breakpoints_with_breakpoints(tmp_path, env_mock): + env_mock.current_breakpoints_state = { "file1.py|||10": "b file1.py:10", "file1.py|||20": "b file1.py:20", "file1.py|||30": "b file1.py:30", "file2.py|||15": "b file2.py:15", } - result = env.current_breakpoints() + result = env_mock.current_breakpoints() expected_result = ( "line 10 in file1.py\n" "line 20 in file1.py\n" @@ -424,7 +422,7 @@ def test_queue_and_process_events(): def test_has_breakpoint_true_and_false(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() file_path = env.working_dir / "test.py" file_path.write_text("print('hello')") @@ -438,7 +436,7 @@ def test_has_breakpoint_true_and_false(tmp_path): def test_has_breakpoint_relative_path(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() file_path = env.working_dir / "foo.py" file_path.write_text("print('foo')") diff --git a/tests/gym/envs/test_mini_nightmare.py b/tests/gym/envs/test_mini_nightmare.py index eee46ee4..0a8590c5 100644 --- a/tests/gym/envs/test_mini_nightmare.py +++ b/tests/gym/envs/test_mini_nightmare.py @@ -12,23 +12,23 @@ def mini_nightmare_env(): # Initialize the MiniNightmareEnv with LocalTerminal terminal = LocalTerminal() - env = MiniNightmareEnv(terminal=terminal) + dataset = MiniNightmareEnv.load_dataset() + task_data = dataset["config"] + env = MiniNightmareEnv(task_data=task_data, terminal=terminal) env.add_tool(Toolbox.get_tool("eval")) return env def test_load_dataset(mini_nightmare_env): - dataset = mini_nightmare_env.load_dataset() - assert mini_nightmare_env.dataset == dataset - + dataset = MiniNightmareEnv.load_dataset() subproblems = list(dataset.keys())[::2] - subset = mini_nightmare_env.load_dataset(problems=subproblems) + subset = MiniNightmareEnv.load_dataset(problems=subproblems) assert list(subset.keys()) == subproblems @patch("debug_gym.gym.envs.mini_nightmare.build_docker_image") def test_build_docker_image(mock_build_docker_image): - MiniNightmareEnv() + dataset = MiniNightmareEnv.load_dataset() mock_build_docker_image.assert_called_once() @@ -53,11 +53,13 @@ def test_reset(mini_nightmare_env): @pytest.if_docker_running def test_reset_with_docker_terminal(): - env = MiniNightmareEnv() + dataset = MiniNightmareEnv.load_dataset() + task_data = dataset["config"] + env = MiniNightmareEnv(task_data=task_data) env.add_tool(Toolbox.get_tool("eval")) assert isinstance(env.terminal, DockerTerminal) - infos = env.reset(options={"task_name": "config"}) + infos = env.reset() assert env.instructions == infos.step_observation.observation assert "2 failed" in infos.eval_observation.observation assert infos.max_score == 2 diff --git a/tests/gym/envs/test_r2egym.py b/tests/gym/envs/test_r2egym.py index d2d2e92c..fdea0096 100644 --- a/tests/gym/envs/test_r2egym.py +++ b/tests/gym/envs/test_r2egym.py @@ -1,4 +1,3 @@ -from pathlib import Path from unittest.mock import MagicMock, patch import pyarrow as pa @@ -8,7 +7,6 @@ 
from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation from debug_gym.gym.envs.r2egym import R2EGymEnv -from debug_gym.gym.terminals.docker import DockerTerminal from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -16,14 +14,20 @@ @pytest.if_docker_running def test_load_dataset(get_r2egym_env): env = get_r2egym_env() - assert env.dataset_id == "R2E-Gym/R2E-Gym-Lite" - # check if the dataset contains features that R2EGymEnv expects - assert sorted(env.ds.features.keys()) == sorted( + + task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" + dataset = env.load_dataset(problems=[task_name]) + assert task_name in dataset + + task_data = next(iter(dataset.values())) + assert sorted(task_data.keys()) == sorted( [ "commit_hash", + "env_type", "docker_image", "execution_result_content", "expected_output_json", + "instance_id", "modified_entity_summaries", "modified_files", "num_non_test_files", @@ -38,20 +42,15 @@ def test_load_dataset(get_r2egym_env): ) -@patch("docker.from_env") -def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): +def test_load_dataset_from_parquet(tmp_path): """Test loading R2EGym dataset from a local Parquet file.""" - # Mock Docker client to avoid trying to pull images - mock_docker_client = MagicMock() - mock_docker_client.images.list.return_value = [] - mock_docker_from_env.return_value = mock_docker_client # Create a minimal test Parquet file with expected schema parquet_file = tmp_path / "test_dataset.parquet" - + docker_image = "test_repo:test_hash_123" data = { "commit_hash": ["test_hash_123"], - "docker_image": ["test_repo:test_hash_123"], + "docker_image": [docker_image], "execution_result_content": ["test execution result"], "expected_output_json": ['{"test": "output"}'], "modified_entity_summaries": ["test summaries"], @@ -69,19 +68,19 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): table = pa.table(data) pq.write_table(table, str(parquet_file)) - # Mock the terminal to avoid actual Docker operations - mock_terminal = MagicMock(spec=DockerTerminal) - # Load the dataset from the Parquet file - env = R2EGymEnv(dataset_id=str(parquet_file), split="train", terminal=mock_terminal) + dataset = R2EGymEnv.load_dataset(dataset_id=str(parquet_file), split="train") + dataset_entry = next(iter(dataset.values())) # Verify the dataset contains the expected features - assert sorted(env.ds.features.keys()) == sorted( + assert sorted(dataset_entry) == sorted( [ "commit_hash", + "env_type", "docker_image", "execution_result_content", "expected_output_json", + "instance_id", "modified_entity_summaries", "modified_files", "num_non_test_files", @@ -96,26 +95,26 @@ def test_load_dataset_from_parquet(mock_docker_from_env, tmp_path): ) # Verify the dataset has the expected data - assert len(env.ds) == 1 - assert env.ds[0]["docker_image"] == "test_repo:test_hash_123" - assert env.ds[0]["commit_hash"] == "test_hash_123" - assert "Test problem statement" in env.ds[0]["problem_statement"] + assert len(dataset) == 1 + task_name = docker_image # For R2EGym, we use docker_image as instance_id + assert docker_image in dataset + assert dataset[task_name]["docker_image"] == "test_repo:test_hash_123" + assert dataset[task_name]["commit_hash"] == "test_hash_123" + assert "Test problem statement" in dataset[task_name]["problem_statement"] @pytest.if_docker_running def test_instructions(get_r2egym_env): env = get_r2egym_env() - 
env.setup_task("aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324") # Instructions might be wrapped by [ISSUE] [/ISSUE] - assert env.instructions in env.ds_row["problem_statement"] + assert env.instructions in env.task_data["problem_statement"] @pytest.if_docker_running def test_setup_task(get_r2egym_env): env = get_r2egym_env() - task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" - env.setup_task(task_name) - assert env.task_name == task_name + assert env.task_name == "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" + env.setup_task() assert ( env.base_image == "namanjain12/aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" @@ -128,8 +127,7 @@ def test_setup_task(get_r2egym_env): @pytest.if_docker_running def test_setup_terminal(get_r2egym_env): env = get_r2egym_env() - task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" - env.reset(options={"task_name": task_name}) + env.reset() _, output = env.terminal.run(f"ls -a") assert ".git" in output assert "r2e_tests" in output @@ -140,9 +138,7 @@ def test_setup_terminal(get_r2egym_env): def test_reset_and_step(get_r2egym_env): env = get_r2egym_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) + env_info = env.reset() assert env.instructions == env_info.step_observation.observation assert "short test summary info" in env_info.eval_observation.observation @@ -197,9 +193,7 @@ def test_reset_and_step(get_r2egym_env): @pytest.if_docker_running def test_readonly_file(get_r2egym_env): env = get_r2egym_env() - env_info = env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) + env_info = env.reset() assert env.workspace._is_readonly_func("/testbed/r2e_tests/test_1.py") env.add_tool(Toolbox.get_tool("view")) @@ -229,10 +223,7 @@ def test_readonly_file(get_r2egym_env): def test_apply_gold_patch(get_r2egym_env): env = get_r2egym_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) - + env_info = env.reset() assert not env_info.terminated assert not env_info.resolved assert env_info.score == env.score == 0 @@ -247,19 +238,17 @@ def test_apply_gold_patch(get_r2egym_env): def test_running_solution_agent(get_r2egym_env, tmp_path): """End-to-end SolutionAgent run for R2E-Gym environment, asserting successful resolution after gold patch.""" env = get_r2egym_env() - task_name = "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324" config = { "output_path": str(tmp_path), "random_seed": 0, "memory_size": 8, "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, } for tool_name in ["pdb", "eval", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": task_name}) + env.reset() success = agent.run(env) assert success @@ -268,9 +257,7 @@ def test_running_solution_agent(get_r2egym_env, tmp_path): def test_debug_entrypoint_contains_pdb(get_r2egym_env): """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" env = get_r2egym_env() - env.reset( - options={"task_name": "aiohttp_final:d7cd0613472fd4d9940e37f1c55921f6a1515324"} - ) + env.reset() assert ( "python -m pdb" in env.debug_entrypoint ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" diff --git a/tests/gym/envs/test_swe_bench.py 
b/tests/gym/envs/test_swe_bench.py index c8f86cb4..4d787cd3 100644 --- a/tests/gym/envs/test_swe_bench.py +++ b/tests/gym/envs/test_swe_bench.py @@ -10,16 +10,14 @@ @pytest.if_docker_running def test_instructions(get_swe_bench_env): env = get_swe_bench_env() - env.ds_row = {"problem_statement": "Test problem statement"} - expected_instructions = "Test problem statement" - assert env.instructions == expected_instructions + assert env.instructions == env.task_data["problem_statement"] @pytest.if_docker_running def test_reset_and_step(get_swe_bench_env): env = get_swe_bench_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset(options={"task_name": "astropy__astropy-14096"}) + env_info = env.reset() assert env.instructions == env_info.step_observation.observation assert "short test summary info" in env_info.eval_observation.observation @@ -99,46 +97,52 @@ def test_readonly_file(get_swe_bench_env): assert "|-- test_sky_coord.py (read-only)" in env_info.step_observation.observation -@pytest.if_docker_running def test_load_dataset(get_swe_bench_env): env = get_swe_bench_env() - assert env.dataset_id == "SWE-bench/SWE-bench_Verified" + + dataset = env.load_dataset() task_name = "astropy__astropy-14096" - assert task_name in env.dataset.keys() - assert list(env.ds.features.keys()) == [ - "repo", - "instance_id", - "base_commit", - "patch", - "test_patch", - "problem_statement", - "hints_text", - "created_at", - "version", - "FAIL_TO_PASS", - "PASS_TO_PASS", - "environment_setup_commit", - "difficulty", - ] + assert task_name in dataset + + task_data = next(iter(dataset.values())) + assert sorted(task_data.keys()) == sorted( + [ + "repo", + "env_type", + "instance_id", + "base_commit", + "patch", + "test_patch", + "problem_statement", + "hints_text", + "created_at", + "version", + "FAIL_TO_PASS", + "PASS_TO_PASS", + "environment_setup_commit", + "difficulty", + ] + ) -@pytest.if_docker_running def test_setup_task(get_swe_bench_env): env = get_swe_bench_env() task_name = "astropy__astropy-14096" - env.setup_task(task_name) assert env.task_name == task_name - assert env.ds_row["repo"] == "astropy/astropy" - assert env.ds_row["version"] == "5.1" - assert isinstance(env.ds_row, dict) - assert isinstance(env.install_configs, dict) + env.setup_task() + assert env.repo == "astropy/astropy" + assert env.version == "5.1" + assert env.package_name == "astropy" + assert ( + env.base_image == "swebench/sweb.eval.x86_64.astropy_1776_astropy-14096:latest" + ) @pytest.if_docker_running def test_setup_terminal(get_swe_bench_env): env = get_swe_bench_env() task_name = "astropy__astropy-14096" - env.reset(options={"task_name": task_name}) + env.reset() _, git_logs = env.terminal.run("git log -n 4") assert env.base_commit in git_logs assert f"Applying test patch for {task_name}" not in git_logs @@ -167,7 +171,7 @@ def test_patch_property(tmp_path, get_swe_bench_env): env = get_swe_bench_env() # Reset with a task to set up the environment - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() # Initially, there should be no changes (empty patch) initial_patch = env.patch @@ -218,7 +222,7 @@ def new_function(): def test_apply_gold_patch(get_swe_bench_env): env = get_swe_bench_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset(options={"task_name": "astropy__astropy-14096"}) + env_info = env.reset() assert not env_info.terminated assert not env_info.resolved @@ -242,12 +246,11 @@ def test_running_solution_agent(get_swe_bench_env, tmp_path): # Optional values that 
BaseAgent.run would use; harmless to include here. "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, } for tool_name in ["pdb", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() success = agent.run(env) assert success @@ -256,7 +259,7 @@ def test_running_solution_agent(get_swe_bench_env, tmp_path): def test_debug_entrypoint_contains_pdb(get_swe_bench_env): """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" env = get_swe_bench_env() - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() assert ( "python -m pdb" in env.debug_entrypoint ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" @@ -266,7 +269,7 @@ def test_debug_entrypoint_contains_pdb(get_swe_bench_env): def test_setup_terminal_debug_mode(get_swe_bench_debug_env): env = get_swe_bench_debug_env() task_name = "astropy__astropy-14096" - env.reset(options={"task_name": task_name}) + env.reset() _, git_logs = env.terminal.run("git log -n 4") assert env.base_commit in git_logs assert f"Applying test patch for {task_name}" in git_logs @@ -287,11 +290,10 @@ def test_running_solution_agent_in_debug_mode(get_swe_bench_debug_env, tmp_path) # Optional values that BaseAgent.run would use; harmless to include here. "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, } for tool_name in ["pdb", "eval", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": "astropy__astropy-14096"}) + env.reset() success = agent.run(env) assert success diff --git a/tests/gym/envs/test_swe_smith.py b/tests/gym/envs/test_swe_smith.py index 8c46befc..1a641ca9 100644 --- a/tests/gym/envs/test_swe_smith.py +++ b/tests/gym/envs/test_swe_smith.py @@ -1,11 +1,13 @@ from pathlib import Path import datasets +import pyarrow as pa +import pyarrow.parquet as pq import pytest from debug_gym.agents.solution_agent import AgentSolution from debug_gym.gym.entities import Observation -from debug_gym.gym.envs import SWESmithEnv +from debug_gym.gym.envs.swe_smith import SWESmithEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -13,11 +15,17 @@ @pytest.if_docker_running def test_load_dataset(get_swe_smith_env): env = get_swe_smith_env() - assert env.dataset_id == "SWE-bench/SWE-smith" + + task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" + dataset = env.load_dataset(problems=[task_name]) + assert task_name in dataset + # check if the dataset contains features that SWESmithEnv expects - assert sorted(env.ds.features.keys()) == sorted( + task_data = next(iter(dataset.values())) + assert sorted(task_data.keys()) == sorted( [ "instance_id", + "env_type", "repo", "patch", "FAIL_TO_PASS", @@ -32,8 +40,9 @@ def test_load_dataset(get_swe_smith_env): def test_load_dataset_from_parquet(tmp_path): """Test that loading from a local Parquet file works correctly.""" + # Create a sample parquet file with the required features - sample_data = { + data = { "instance_id": ["test-instance-1", "test-instance-2"], "repo": ["test/repo1", "test/repo2"], "patch": ["diff --git a/file.py", "diff --git b/file2.py"], @@ -44,20 +53,20 @@ def test_load_dataset_from_parquet(tmp_path): "base_commit": ["abc123", "def456"], "problem_statement": ["Problem 1", "Problem 2"], } + parquet_file = tmp_path / 
"test_dataset.parquet" - # Create a dataset and save as parquet - ds = datasets.Dataset.from_dict(sample_data) - parquet_path = tmp_path / "test_dataset.parquet" - ds.to_parquet(str(parquet_path)) + table = pa.table(data) + pq.write_table(table, str(parquet_file)) - # Test that the parquet file can be loaded using datasets library - # mimicking what SWESmithEnv.load_dataset() does for parquet files - loaded_ds = datasets.load_dataset("parquet", data_files=str(parquet_path))["train"] + # Load the dataset from the Parquet file + dataset = SWESmithEnv.load_dataset(dataset_id=str(parquet_file), split="train") + dataset_entry = next(iter(dataset.values())) # Verify that the dataset was loaded correctly with expected features - assert sorted(loaded_ds.features.keys()) == sorted( + assert sorted(dataset_entry.keys()) == sorted( [ "instance_id", + "env_type", "repo", "patch", "FAIL_TO_PASS", @@ -69,25 +78,20 @@ def test_load_dataset_from_parquet(tmp_path): ] ) # Verify that the data is accessible - assert len(loaded_ds) == 2 - assert loaded_ds[0]["instance_id"] == "test-instance-1" - assert loaded_ds[1]["instance_id"] == "test-instance-2" + assert len(dataset) == 2 + assert sorted(dataset.keys()) == ["test-instance-1", "test-instance-2"] -@pytest.if_docker_running def test_instructions(get_swe_smith_env): env = get_swe_smith_env() - env.ds_row = {"problem_statement": "Test problem statement"} - expected_instructions = "Test problem statement" - assert env.instructions == expected_instructions + assert env.instructions == env.task_data["problem_statement"] -@pytest.if_docker_running def test_setup_task(get_swe_smith_env): env = get_swe_smith_env() task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - env.setup_task(task_name) assert env.task_name == task_name + env.setup_task() assert env.repo == "john-kurkowski/tldextract" assert env.branch_name == task_name assert env.package_name == "tldextract" @@ -97,7 +101,7 @@ def test_setup_task(get_swe_smith_env): def test_setup_terminal(get_swe_smith_env): env = get_swe_smith_env() task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - env.reset(options={"task_name": task_name}) + env.reset() _, git_logs = env.terminal.run("git log -n 4") # For SWE-Smith the base commit is found in the branch associated to the # instance id and is different from the one in the main branch. 
@@ -112,11 +116,7 @@ def test_setup_terminal(get_swe_smith_env): def test_reset_and_step(get_swe_smith_env): env = get_swe_smith_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env_info = env.reset() assert env.instructions == env_info.step_observation.observation assert "short test summary info" in env_info.eval_observation.observation @@ -156,11 +156,7 @@ def test_reset_and_step(get_swe_smith_env): @pytest.if_docker_running def test_readonly_file(get_swe_smith_env): env = get_swe_smith_env() - env_info = env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env_info = env.reset() env.add_tool(Toolbox.get_tool("view")) env.add_tool(Toolbox.get_tool("listdir")) @@ -199,11 +195,7 @@ def test_readonly_file(get_swe_smith_env): def test_apply_gold_patch(get_swe_smith_env): env = get_swe_smith_env() env.add_tool(Toolbox.get_tool("eval")) - env_info = env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env_info = env.reset() assert not env_info.terminated assert not env_info.resolved @@ -220,8 +212,7 @@ def test_calculate_score_with_pytest_error(get_swe_smith_env): """Test that the indentation error in pytest is handled correctly.""" env = get_swe_smith_env() env.add_tool(Toolbox.get_tool("eval")) - task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - env.reset(options={"task_name": task_name}) + env.reset() # Modify 'tldextract/tldextract.py' in the working_dir to introduce an indentation error. content = env.workspace.read_file("tldextract/tldextract.py").split("\n") @@ -253,19 +244,18 @@ def test_calculate_score_with_pytest_error(get_swe_smith_env): def test_running_solution_agent(get_swe_smith_env, tmp_path): """Analogous to SWE Bench solution agent test: run SolutionAgent end-to-end and assert success.""" env = get_swe_smith_env() - task_name = "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" config = { "output_path": str(tmp_path), "random_seed": 0, "memory_size": 8, "max_steps": 1, "max_rewrite_steps": 1, - "env_kwargs": {}, + "env": env, } for tool_name in ["pdb", "eval", "submit"]: env.add_tool(Toolbox.get_tool(tool_name)) agent = AgentSolution(agent_args=config, llm=None, logger=env.logger) - env.reset(options={"task_name": task_name}) + env.reset() success = agent.run(env) assert success @@ -274,11 +264,7 @@ def test_running_solution_agent(get_swe_smith_env, tmp_path): def test_debug_entrypoint_contains_pdb(get_swe_smith_env): """Ensure the environment's debug_entrypoint includes '-m pdb' for interactive debugging.""" env = get_swe_smith_env() - env.reset( - options={ - "task_name": "john-kurkowski__tldextract.3d1bf184.combine_file__1vnuqpt4" - } - ) + env.reset() assert ( "python -m pdb" in env.debug_entrypoint ), f"Expected '-m pdb' in debug_entrypoint, got: {env.debug_entrypoint}" diff --git a/tests/gym/terminals/test_kubernetes.py b/tests/gym/terminals/test_kubernetes.py index 0161fbcc..dcde0a56 100644 --- a/tests/gym/terminals/test_kubernetes.py +++ b/tests/gym/terminals/test_kubernetes.py @@ -70,7 +70,7 @@ def test_kubernetes_terminal_init(): assert terminal._pod is not None # Pod name should be automatically generated when not provided at initialization. 
- assert terminal.pod_name.startswith("dbg-gym.") + assert terminal.pod_name.startswith("dbg-gym-") assert terminal.pod.is_running() assert terminal.pod.exists() diff --git a/tests/gym/test_utils.py b/tests/gym/test_utils.py index 6a51b583..47f50335 100644 --- a/tests/gym/test_utils.py +++ b/tests/gym/test_utils.py @@ -2,7 +2,7 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.utils import ( _walk, cleanup_pytest_output, @@ -45,7 +45,7 @@ def test_show_line_number_no_code_path_no_breakpoints(): def test_show_line_number_with_code_path(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() code_path = f"{env.working_dir}/code.py" breakpoints_state = {f"{code_path}|||2": "b 2"} @@ -65,7 +65,7 @@ def test_show_line_number_with_code_path(tmp_path): def test_show_line_number_multiple_breakpoints(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() code_path = f"{env.working_dir}/code.py" breakpoints_state = { @@ -92,7 +92,7 @@ def test_show_line_number_multiple_breakpoints(tmp_path): def test_show_line_number_multiple_breakpoints_with_start_index(tmp_path): - env = RepoEnv(path=tmp_path) + env = LocalEnv(path=tmp_path) env.reset() code_path = f"{env.working_dir}/code.py" breakpoints_state = { diff --git a/tests/gym/tools/test_bash.py b/tests/gym/tools/test_bash.py index 5e7d860e..5644066a 100644 --- a/tests/gym/tools/test_bash.py +++ b/tests/gym/tools/test_bash.py @@ -4,7 +4,7 @@ import pytest from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.bash import BashTool from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -30,7 +30,7 @@ def env(tmp_path): with open(subdir / "nested.txt", "w") as f: f.write("nested file content") - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) bash_tool = Toolbox.get_tool("bash") env.add_tool(bash_tool) env.reset() diff --git a/tests/gym/tools/test_eval.py b/tests/gym/tools/test_eval.py index 7279de81..4bae1026 100644 --- a/tests/gym/tools/test_eval.py +++ b/tests/gym/tools/test_eval.py @@ -2,7 +2,7 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -15,7 +15,7 @@ def env(tmp_path): with open(repo_path / "test_1.py", "w") as f: f.write("def test_1():\n assert False\n") - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) env.reset() return env diff --git a/tests/gym/tools/test_grep.py b/tests/gym/tools/test_grep.py index 9d3e7b4e..b594bd6f 100644 --- a/tests/gym/tools/test_grep.py +++ b/tests/gym/tools/test_grep.py @@ -1,10 +1,6 @@ -import os -import tempfile -from pathlib import Path - import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.grep import GrepTool @@ -35,7 +31,7 @@ def hello_world(): class TestClass: def __init__(self): self.value = 42 - + def method_with_bug(self): # TODO: Fix this bug return self.value / 0 # This will cause a division by zero error @@ -62,7 +58,7 @@ def load_config(filename): class EmailValidator: def __init__(self): self.pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - + def validate(self, email): return re.match(self.pattern, email) is not None """ @@ -209,7 +205,7 @@ def 
_setup_grep_repo_env(base_dir, ignore_patterns=None, readonly_patterns=None) with (test_repo / ".debugreadonly").open("w") as f: f.write("\n".join(readonly_patterns)) - env = RepoEnv(path=str(test_repo)) + env = LocalEnv(path=str(test_repo)) grep_tool = GrepTool() env.reset() return grep_tool, env diff --git a/tests/gym/tools/test_listdir.py b/tests/gym/tools/test_listdir.py index 4198266a..c405ae05 100644 --- a/tests/gym/tools/test_listdir.py +++ b/tests/gym/tools/test_listdir.py @@ -1,6 +1,6 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.listdir import ListdirTool @@ -8,7 +8,7 @@ def setup_listdir_repo_env(setup_test_repo): def _setup_listdir_repo_env(base_dir): test_repo = setup_test_repo(base_dir) - env = RepoEnv(path=str(test_repo)) + env = LocalEnv(path=str(test_repo)) listdir_tool = ListdirTool() listdir_tool.register(env) env.reset() diff --git a/tests/gym/tools/test_pdb.py b/tests/gym/tools/test_pdb.py index 23232ce9..0b6caf13 100644 --- a/tests/gym/tools/test_pdb.py +++ b/tests/gym/tools/test_pdb.py @@ -7,10 +7,8 @@ import pytest from debug_gym.gym.entities import Event -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.terminals.docker import DockerTerminal -from debug_gym.gym.terminals.local import LocalTerminal -from debug_gym.gym.terminals.shell_session import ProcessNotRunningError from debug_gym.gym.tools.pdb import PDBTool @@ -60,7 +58,7 @@ def _breakpoints_state(working_dir): def setup_pdb_repo_env(setup_test_repo, setup_breakpoints_state): def _setup_pdb_repo_env(base_dir): test_repo = setup_test_repo(base_dir) - env = RepoEnv(path=str(test_repo)) + env = LocalEnv(path=str(test_repo)) pdb_tool = PDBTool(persistent_breakpoints=True, auto_list=True) pdb_tool.register(env) env.reset() @@ -75,10 +73,8 @@ def _setup_pdb_repo_env(base_dir): def test_pdb_use(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -103,10 +99,8 @@ def test_pdb_use(tmp_path, setup_test_repo): def test_pdb_use_empty_command(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -120,10 +114,8 @@ def test_pdb_use_empty_command(tmp_path, setup_test_repo): def test_pdb_b_fail_blank_or_comment(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -141,10 +133,8 @@ def test_pdb_b_fail_blank_or_comment(tmp_path, setup_test_repo): def test_pdb_pass_empty_path_if_in_session(tmp_path, setup_test_repo): # Test PDBTool with LocalTerminal, verbose pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv( + env = LocalEnv( path=tests_path, - terminal=terminal, debug_entrypoint="python -m pdb -m pytest -sv .", ) env.reset() @@ -164,8 +154,7 @@ def test_pdb_pass_empty_path_if_in_session(tmp_path, setup_test_repo): def 
test_pdb_use_default_env_entrypoint(tmp_path, setup_test_repo): # Test PDBTool with default env entrypoint, quiet pytest tests_path = str(setup_test_repo(tmp_path)) - terminal = LocalTerminal() - env = RepoEnv(path=tests_path, terminal=terminal) + env = LocalEnv(path=tests_path) env.reset() pdb = PDBTool() initial_output = pdb.start_pdb(env) # "python -m pdb -m pytest -sq ." @@ -202,7 +191,9 @@ def test_pdb_use_docker_terminal(tmp_path, setup_test_repo): ) # no:cacheprovider to avoid .pytest_cache, --tb=short to reduce output debug_entrypoint = "python -m pdb -m pytest -p no:cacheprovider --color=no -sv ." - env = RepoEnv(path=tests_path, terminal=terminal, debug_entrypoint=debug_entrypoint) + env = LocalEnv( + path=tests_path, terminal=terminal, debug_entrypoint=debug_entrypoint + ) env.reset() pdb = PDBTool() pdb.start_pdb(env) @@ -228,8 +219,8 @@ def test_initialization(): assert pdb_tool._session is None -def test_register(): - env = RepoEnv() +def test_register(tmp_path): + env = LocalEnv(path=tmp_path) pdb_tool = PDBTool() pdb_tool.register(env) # every tool listen to ENV_RESET event to track history @@ -369,7 +360,7 @@ def test_pdb_crashing(tmp_path, setup_test_repo): with open(tests_path / "test_fail.py", "w") as f: f.write("def test_fail():\nassert False") # IndentationError - env = RepoEnv( + env = LocalEnv( path=tests_path, entrypoint="python -m pytest -s test.py", debug_entrypoint="python -m pdb -m pytest -s test_fail.py", @@ -390,7 +381,7 @@ def test_pdb_timeout(tmp_path, setup_test_repo): "def test_fail():\n print('Sleeping...'); import time; time.sleep(10)" ) # IndentationError - env = RepoEnv( + env = LocalEnv( path=tests_path, entrypoint="python -m pytest -s test.py", debug_entrypoint="python -m pdb -m pytest -sv test_fail.py", diff --git a/tests/gym/tools/test_rewrite.py b/tests/gym/tools/test_rewrite.py index e8ad0772..003f31e6 100644 --- a/tests/gym/tools/test_rewrite.py +++ b/tests/gym/tools/test_rewrite.py @@ -2,7 +2,7 @@ import pytest -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.rewrite import RewriteTool @@ -23,7 +23,7 @@ def env(tmp_path): with open(repo_path / "test.py", "w") as f: f.write(file_content) - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) rewrite_tool = RewriteTool() env.add_tool(rewrite_tool) diff --git a/tests/gym/tools/test_tool.py b/tests/gym/tools/test_tool.py index 010526cd..a724befe 100644 --- a/tests/gym/tools/test_tool.py +++ b/tests/gym/tools/test_tool.py @@ -1,7 +1,10 @@ +from pathlib import Path + import pytest from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.env import Event, RepoEnv +from debug_gym.gym.envs.env import Event +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import EnvironmentTool, Record from debug_gym.gym.tools.toolbox import Toolbox @@ -13,9 +16,14 @@ def use(self, env, action): return Observation("FakeTool", action) -def test_register_valid_environment(): +@pytest.fixture +def env(tmp_path): + env = LocalEnv(path=tmp_path) + return env + + +def test_register_valid_environment(env): tool = FakeTool() - env = RepoEnv() tool.register(env) # every tool listen to ENV_RESET event to track history assert tool in env.event_hooks.event_listeners[Event.ENV_RESET] @@ -46,7 +54,7 @@ class CompletelyFakeTool(EnvironmentTool): tool = CompletelyFakeTool() -def test_auto_subscribe(monkeypatch): +def test_auto_subscribe(monkeypatch, env): @Toolbox.register() class ToolWithHandler(FakeTool): @@ -55,7 
+63,6 @@ def on_env_reset(self, **kwargs): tool = ToolWithHandler() - env = RepoEnv() env.add_tool(tool) assert tool in env.event_hooks.event_listeners[Event.ENV_RESET] @@ -65,9 +72,8 @@ def on_env_reset(self, **kwargs): assert tool not in env.event_hooks.event_listeners[channel] -def test_track_history(): +def test_track_history(env): tool = FakeTool() - env = RepoEnv() assert hasattr(tool, "history") assert isinstance(tool.history, list) @@ -90,18 +96,16 @@ def test_track_history(): ) -def test_unknown_args(): +def test_unknown_args(env): tool = FakeTool() - env = RepoEnv() obs = tool(env, unknown_arg="unknown_value") assert obs == Observation( "FakeTool", "FakeTool.use() got an unexpected keyword argument 'unknown_arg'" ) -def test_unregister(): +def test_unregister(env): tool = FakeTool() - env = RepoEnv() tool.register(env) # Verify tool is registered @@ -120,7 +124,7 @@ def test_unregister_invalid_environment(): tool.unregister(object()) -def test_unregister_with_multiple_handlers(): +def test_unregister_with_multiple_handlers(env): class ToolWithMultipleHandlers(FakeTool): def on_env_reset(self, environment, **kwargs): return "Handler for Event.ENV_RESET" @@ -129,7 +133,6 @@ def on_env_step(self, environment, **kwargs): return "Handler for Event.ENV_STEP" tool = ToolWithMultipleHandlers() - env = RepoEnv() tool.register(env) # Verify tool is registered for both events diff --git a/tests/gym/tools/test_view.py b/tests/gym/tools/test_view.py index ec2742bb..5d9f5e10 100644 --- a/tests/gym/tools/test_view.py +++ b/tests/gym/tools/test_view.py @@ -3,7 +3,7 @@ import pytest from debug_gym.gym.entities import Observation -from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.tools.tool import ToolCall from debug_gym.gym.tools.toolbox import Toolbox @@ -29,7 +29,7 @@ def env(tmp_path): (repo_path / "empty.py").touch() # Create an empty file - env = RepoEnv(path=repo_path) + env = LocalEnv(path=repo_path) view_tool = Toolbox.get_tool("view") env.add_tool(view_tool) env.reset()
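A minimal sketch, assuming the restructured configs above: the nested env / dataset / terminal sections replace the old benchmark, problems and env_kwargs keys, and scripts/run.py now resolves tasks through load_dataset and select_env. The concrete values below (output_path, run_timeout, terminal type) are illustrative only; the section layout and the debug_gym calls follow this diff.

import yaml

from debug_gym.gym.envs import load_dataset, select_env

# Illustrative config in the new layout (values are assumptions, not from the patch).
config = yaml.safe_load(
    """
base:
  output_path: "exps/example"
  env:
    run_timeout: 30
  dataset:
    type: "mini_nightmare"
    problems: "all"
  terminal:
    type: "local"
"""
)["base"]

# The dataset section alone decides which tasks exist; the environment class is
# selected per task from task_data["env_type"] rather than a top-level "benchmark" key.
dataset = load_dataset(config["dataset"])
for task_name, task_data in dataset.items():
    env_class = select_env(task_data["env_type"])
    # Per-environment keyword arguments now live under the "env" section.
    env = env_class(task_data=task_data, **config.get("env", {}))
    # ... add tools, then run an agent with agent.run(env)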