Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
7e78e53
kube
Nov 27, 2025
b448767
uuid as name
Nov 27, 2025
9ac9226
normalize pod name
Nov 27, 2025
cebbf37
load dataset outside
Nov 27, 2025
ffd87cf
remove tolerations
Nov 27, 2025
0d29967
incorporate dataset loading
Nov 27, 2025
c7afaa2
some type annotations
Nov 27, 2025
c506fe1
fixture first fix
Nov 27, 2025
aab04ff
fix
Nov 27, 2025
cc8f813
fix tests
Nov 27, 2025
77aeb78
simplify filtering
Nov 27, 2025
28caf41
remove deps on swesmith! also fix excluded_ids for swesmith
Nov 27, 2025
d9b76c7
remove swesmith
Nov 27, 2025
e13462b
Merge remote-tracking branch 'origin/main' into envs_for_images
MarcCote Nov 28, 2025
928c1d8
load dataset as class method / setup_task
Nov 28, 2025
b338e1c
fix tests
Nov 28, 2025
0858bea
change run.py
Nov 28, 2025
35a4f66
blacked
Nov 28, 2025
e9600ed
remove imports
Nov 28, 2025
81b2eda
task name / task data adaptation
Nov 28, 2025
3468a62
pre commit
Nov 28, 2025
c56579c
cls keyword
Nov 28, 2025
4b01ac8
remove load dataset
Nov 28, 2025
0dd0f4e
Working on tests + refactoring
MarcCote Nov 28, 2025
e6fcd58
Adding back swesmith
MarcCote Nov 28, 2025
c80e6d8
Fixing tests.
MarcCote Dec 1, 2025
7d8268e
Print disk space after installing library.
MarcCote Dec 1, 2025
866ac39
When creating fixture env, reset the env in master thread first
MarcCote Dec 1, 2025
1f4661a
Disabling async pytests
MarcCote Dec 1, 2025
424b3dd
Reenable async pytests + make sure to provide specific problem to loa…
MarcCote Dec 1, 2025
e0263a2
Fixing load_dataset
MarcCote Dec 1, 2025
e5cda5a
Limiting workers for async pytest
MarcCote Dec 1, 2025
f9d0875
Debugging pytest-async
MarcCote Dec 2, 2025
3959705
Fixing load_dataset to be more memory efficient
MarcCote Dec 2, 2025
22121f3
Update run.py and config files
MarcCote Dec 2, 2025
48b7e60
Making sure solution agent works on SWE-Bench and SWESmith
MarcCote Dec 3, 2025
a7e9586
Make sure R2EGym solution agent works.
MarcCote Dec 3, 2025
6bcfd88
Fix tests about load_config
MarcCote Dec 3, 2025
a55cd11
Fix tests
MarcCote Dec 3, 2025
64491d1
Change default instructions for LocalEnv
MarcCote Dec 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/actions/test-if-changes/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@ runs:
else
pip install "debug-gym[dev]==${{ inputs.version }}"
fi
df -h
- name: Run tests
env:
DEBUG_GYM_DEBUG: 1
shell: bash
run: |
free -h
pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
- name: Store coverage report
uses: actions/upload-artifact@v4
Expand Down
2 changes: 1 addition & 1 deletion debug_gym/agents/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,5 +371,5 @@ def create_agent(
if agent_args is None:
raise ValueError("Either agent_args or config must be provided.")

agent = agent_class(args=agent_args, **agent_kwargs)
agent = agent_class(agent_args=agent_args, **agent_kwargs)
return agent
50 changes: 29 additions & 21 deletions debug_gym/agents/solution_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,28 +39,36 @@ def run(self, env, debug=False):
return True

self.logger.info(f"Score: {info.score}/{info.max_score or '-'}")
# Make a simple pdb call to make sure it is working.
action = ToolCall(name="pdb", id="pdb", arguments={"command": "help help"})
pdb_help_info = self.env.step(action, None, None)
assert "h(elp)" in pdb_help_info.step_observation.observation, (
"PDB command did not return expected help message.\n"
f"{pdb_help_info.step_observation.observation}"
)

# Send a pdb continue command, and check the output matches the one from env.reset.
action = ToolCall(name="pdb", id="pdb", arguments={"command": "continue"})
pdb_continue_info = self.env.step(action, None, None)

assert (
"Reached the end of the program. Restarting the debugging session."
in pdb_continue_info.step_observation.observation
) or (
info.step_observation.observation.splitlines()[-1]
in pdb_continue_info.step_observation.observation
), (
"PDB command did not return expected continue message.\n"
f"{pdb_continue_info.step_observation.observation}"
)
if env.has_tool("pdb"):
# Make a simple pdb call to make sure it is working.
action = ToolCall(
name="pdb", id="pdb", arguments={"command": "help help"}
)
pdb_help_info = self.env.step(action, None, None)
assert "h(elp)" in pdb_help_info.step_observation.observation, (
"PDB command did not return expected help message.\n"
f"{pdb_help_info.step_observation.observation}"
)

# Send a pdb continue command, and check the output matches the one from env.reset.
action = ToolCall(
name="pdb", id="pdb", arguments={"command": "continue"}
)
pdb_continue_info = self.env.step(action, None, None)

pdb_observation = pdb_continue_info.step_observation.observation
expected_messages = [
"Reached the end of the program. Restarting the debugging session.",
"Uncaught exception. Entering post mortem debugging",
]
reset_observation = info.step_observation.observation
if reset_observation.splitlines():
expected_messages.append(reset_observation.splitlines()[-1])

assert any(
msg in pdb_observation for msg in expected_messages
), f"PDB command did not return expected continue message.\n{pdb_observation}"

self.env.apply_gold_patch()

Expand Down
28 changes: 15 additions & 13 deletions debug_gym/agents/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,6 @@ def load_config():
with open(args.config_file) as reader:
config = yaml.safe_load(reader)

# Parse overridden params.
for param in args.params:
fqn_key, value = param.split("=")
entry_to_change = config
keys = fqn_key.split(".")
for k in keys[:-1]:
entry_to_change = entry_to_change[k]
entry_to_change[keys[-1]] = yaml.safe_load(value)

available_agents = [item for item in list(config.keys()) if item != "base"]

if not args.agent:
Expand All @@ -130,14 +121,25 @@ def load_config():
if "base" in config:
# base config is specified (shared across agents)
return_config = config["base"]
agent_specific_config = config[args.agent]
for key in agent_specific_config:
# override base config with agent specific config
return_config[key] = agent_specific_config[key]
# Override base config with agent specific config
for key, value in config[args.agent].items():
return_config[key] = value
else:
# base config is not specified
return_config = config[args.agent]

# Parse overridden params.
for param in args.params:
fqn_key, value = param.split("=")
entry_to_change = return_config
keys = fqn_key.split(".")
for k in keys[:-1]:
if k not in entry_to_change:
entry_to_change[k] = {}

entry_to_change = entry_to_change[k]
entry_to_change[keys[-1]] = yaml.safe_load(value)

# assume agent type is the key if not specified by the user
if not return_config.get("agent_type"):
return_config["agent_type"] = args.agent
Expand Down
24 changes: 21 additions & 3 deletions debug_gym/gym/envs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from debug_gym.gym.envs.aider import AiderBenchmarkEnv
from debug_gym.gym.envs.env import RepoEnv, TooledEnv
from debug_gym.gym.envs.local import LocalEnv
from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
from debug_gym.gym.envs.r2egym import R2EGymEnv
from debug_gym.gym.envs.swe_bench import SWEBenchEnv
from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
from debug_gym.gym.envs.swe_smith import SWESmithEnv
from debug_gym.logger import DebugGymLogger


def select_env(env_type: str = None) -> type[RepoEnv]:
match env_type:
case None:
return RepoEnv
case "local":
return LocalEnv
case "aider":
return AiderBenchmarkEnv
case "swebench":
Expand All @@ -24,4 +26,20 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
case "r2egym":
return R2EGymEnv
case _:
raise ValueError(f"Unknown benchmark {env_type}")
raise ValueError(f"Unknown environment {env_type}")


def load_dataset(config: dict, logger: DebugGymLogger | None = None) -> dict:
    """Load a dataset using the environment class named in ``config["type"]``.

    Args:
        config: Dataset configuration. Must contain a non-None ``"type"``
            entry, which is resolved through ``select_env``. The whole
            mapping (including ``"type"``) is forwarded as keyword arguments
            to the selected environment's ``load_dataset``.
        logger: Optional logger forwarded to the environment's
            ``load_dataset``.

    Returns:
        Whatever the selected environment's ``load_dataset`` returns.

    Raises:
        ValueError: If ``"type"`` is missing/None, or if ``select_env`` does
            not recognize it. In the latter case the original error is kept
            as ``__cause__``.
    """
    env_type = config.get("type")
    if env_type is None:
        raise ValueError("Dataset config must specify 'type' field.")

    try:
        env = select_env(env_type)
    except ValueError as e:
        # Chain explicitly so the traceback shows the lookup failure as the
        # direct cause rather than as an error raised while handling another.
        raise ValueError(
            f"Unknown environment type '{env_type}' from dataset's config: {config}"
        ) from e

    return env.load_dataset(logger=logger, **config)
49 changes: 36 additions & 13 deletions debug_gym/gym/envs/aider.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import os
import subprocess
import tempfile
Expand All @@ -7,16 +8,20 @@
from debug_gym.constants import DEBUG_GYM_CACHE_DIR
from debug_gym.gym.entities import EvalOutput
from debug_gym.gym.envs.env import RepoEnv
from debug_gym.gym.envs.local import LocalEnv
from debug_gym.gym.terminals.docker import DockerTerminal
from debug_gym.gym.terminals.terminal import Terminal
from debug_gym.logger import DebugGymLogger

DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider"


def build_docker_image(logger):
def build_docker_image(logger: logging.Logger | None = None):
"""
Build a Docker image for the Aider benchmark environment.
"""
logger = logger or DebugGymLogger("debug-gym")

# Check if Docker image is built.
import docker

Expand Down Expand Up @@ -62,6 +67,7 @@ class AiderBenchmarkEnv(RepoEnv):

def __init__(
self,
task_data: dict,
entrypoint: str = "python -m pytest --tb=no -s .",
terminal: Terminal | None = None,
**kwargs,
Expand All @@ -73,7 +79,13 @@ def __init__(
if hasattr(terminal, "base_image") and terminal.base_image is None:
terminal.base_image = DOCKER_AIDER_IMAGE_NAME

super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
super().__init__(
task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
)

@property
def task_name(self) -> str:
return self.current_task["task_name"]

@property
def instructions(self) -> str:
Expand All @@ -91,10 +103,8 @@ def eval(self, **kwargs) -> EvalOutput:
self.last_eval = EvalOutput(success, output)
return self.last_eval

def setup_task(self, task_name: str, options: dict = None):
if task_name not in self.dataset:
raise ValueError(f"Task {task_name} not found in the dataset.")
self.current_task = self.dataset[task_name]
def setup_task(self):
self.current_task = self.task_data

def setup_workspace(self):
self.workspace.reset()
Expand Down Expand Up @@ -122,14 +132,21 @@ def setup_terminal(self):
) # Aider tasks come with those.
self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")

def load_dataset(self, problems: str | list[str] | None = None):
if isinstance(self.terminal, DockerTerminal):
build_docker_image(self.logger)
@classmethod
def load_dataset(
cls,
problems: str | list[str] | None = None,
build_image: bool = True,
logger: object = None,
**kwargs,
) -> dict:
if build_image:
build_docker_image(logger)

if not os.path.exists(self.REPO_PATH):
subprocess.run(["git", "clone", self.REPO_URL, self.REPO_PATH], check=True)
if not os.path.exists(cls.REPO_PATH):
subprocess.run(["git", "clone", cls.REPO_URL, cls.REPO_PATH], check=True)

practice_path = self.REPO_PATH / "exercises" / "practice"
practice_path = cls.REPO_PATH / "exercises" / "practice"
directories = [d for d in practice_path.iterdir() if d.is_dir()]

dataset = {}
Expand Down Expand Up @@ -160,11 +177,17 @@ def load_dataset(self, problems: str | list[str] | None = None):
)

dataset[task_name] = {
"task_name": task_name,
"codebase": directory,
"instructions": instructions,
"filename": task_name + ".py",
}

problems = utils.filter_problems(dataset, problems)
dataset = {id: i for id, i in dataset.items() if id in problems}
dataset = {id: data for id, data in dataset.items() if id in problems}

# Add env_type to each task_data.
for task_data in dataset.values():
task_data["env_type"] = "aider"

return dataset
63 changes: 21 additions & 42 deletions debug_gym/gym/envs/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,41 +201,29 @@ class RepoEnv(TooledEnv):

def __init__(
self,
path: str | None = None,
task_data: dict,
entrypoint: str = "python -m pytest -sq .",
debug_entrypoint: str | None = None,
max_score: int | None = None,
readonly_patterns: list[str] | None = None, # TODO: remove
run_timeout: int | None = None,
terminal: Terminal | None = None,
logger: DebugGymLogger | None = None,
problems: str | list[str] | None = None,
**kwargs,
):
super().__init__()

self.path = path
self.task_data = task_data
self.max_score = max_score
self.run_timeout = run_timeout
self.terminal = terminal or LocalTerminal() # TODO: default to DockerTerminal
self.terminal = terminal
self._entrypoint = entrypoint
self._debug_entrypoint = debug_entrypoint
self.logger = logger or DebugGymLogger("debug-gym")
self.infos: EnvInfo | None = None
self.rng = None
self.additional_kwargs = kwargs
self.task_name: str | None = None
self.options: dict = {}

if "auto_eval_on_rewrite" in kwargs:
raise ValueError(
"The 'auto_eval_on_rewrite' parameter is no longer supported. "
"Please remove it from your initialization arguments."
"Instead, set 'auto_eval_on_rewrite' in the EvalTool instance."
)

self.workspace = Workspace(self.terminal, logger=self.logger)
self.dataset = self.load_dataset(problems)
self.set_entrypoints(self._entrypoint, self._debug_entrypoint)

def _reset_env_state(self):
Expand Down Expand Up @@ -290,45 +278,39 @@ def working_dir(self) -> Path:
def instructions(self) -> str:
"""Instructions for the current task.
Override in subclasses for different behavior."""
return ""
raise NotImplementedError(
"Subclasses must implement the instructions property."
)

def setup_task(self, task_name: str, options: dict = None) -> None:
@property
def task_name(self) -> str:
raise NotImplementedError("Subclasses must implement the task_name property.")

def setup_task(self) -> None:
"""Setup the task information.
Override in subclasses for different behavior. Called once at reset."""
pass
raise NotImplementedError("Subclasses must implement setup_task method.")

def setup_workspace(self) -> None:
"""Setup the workspace.
Override in subclasses for different behavior. Called once at reset."""
self.workspace.reset()
self.workspace.copy_content(self.path)
self.workspace.setup_file_filters()
raise NotImplementedError("Subclasses must implement setup_workspace method.")

def setup_terminal(self) -> None:
"""Setup the terminal.
Override in subclasses for different behavior. Called once at reset."""

self.logger.debug(f"Configuring {self.terminal}...")

self.terminal.run("git init -b main")
self.terminal.run("git config user.name 'debug-gym'")
self.terminal.run("git config user.email '<>'")

self.terminal.run("git add *")
self.terminal.run("git commit -am 'Init'")

self.terminal.run("git add .debugignore .debugreadonly")
self.terminal.run("git commit -am 'Add debug-gym ignore and read-only files'")
raise NotImplementedError("Subclasses must implement setup_terminal method.")

def reset(self, *, options: dict = None):
"""Resets the environment and returns eval as the initial observation."""
self.options = options if options is not None else self.options
options = options if options is not None else {}
self.logger.debug("Resetting environment")
self.close() # Clean up previous workspace and terminal.
self.task_name = self.options.get("task_name")
self.setup_task(task_name=self.task_name, options=self.options)
self.setup_workspace()
self.setup_terminal()
if options.get("reset_runtime", True):
self.close() # Clean up previous workspace and terminal.
self.setup_task()
self.setup_workspace()
self.setup_terminal()

self._reset_env_state()

# Notify all tools that the environment is reset and get their observations
Expand Down Expand Up @@ -504,6 +486,3 @@ def close(self):

def __del__(self):
self.close()

def load_dataset(self, problems: str | list[str] | None = None):
return {"custom": None}
Loading
Loading