diff --git a/debug_gym/agents/__init__.py b/debug_gym/agents/__init__.py index 16cd7b67..c955af01 100644 --- a/debug_gym/agents/__init__.py +++ b/debug_gym/agents/__init__.py @@ -1,8 +1,13 @@ +from debug_gym.agents.base_agent import BaseAgent, register_agent +from debug_gym.agents.free_agent import FreeAgent from debug_gym.agents.froggy_agent import FroggyAgent from debug_gym.agents.solution_agent import AgentSolution from debug_gym.agents.swe_agent import SWEAgent __all__ = [ + "BaseAgent", + "register_agent", + "FreeAgent", "FroggyAgent", "AgentSolution", "SWEAgent", diff --git a/debug_gym/agents/free_agent.py b/debug_gym/agents/free_agent.py new file mode 100644 index 00000000..951bde3b --- /dev/null +++ b/debug_gym/agents/free_agent.py @@ -0,0 +1,44 @@ +"""Simple agent example for interacting with FreeEnv.""" + +from debug_gym.agents.base_agent import BaseAgent, register_agent + + +@register_agent +class FreeAgent(BaseAgent): + """Minimal reasoning agent tailored for FreeEnv sessions.""" + + name = "free_agent" + # Customized system instructions keep FreeEnv light-weight while still + # providing the model with a structured exploration checklist. + system_prompt = ( + "You are assisting in an exploratory codebase understanding session inside an open-ended container.\n" + "You have access to a set of tools to inspect and modify the codebase.\n" + "Your goal is to use the tools to gather as much information about the codebase as possible.\n" + "Output both your thinking process (if any) and the tool call (must) in the response.\n" + "When you are done exploring, use the submit tool as the final action to end the session." + ) + + def __init__(self, config, env, llm=None, logger=None): + super().__init__(config=config, env=env, llm=llm, logger=logger) + + override_prompt = config.get("system_prompt") + if override_prompt is not None: + self.system_prompt = str(override_prompt) + + def run(self, task_name=None, debug=False): + """Wrap BaseAgent.run to surface clearer errors when startup fails.""" + try: + return super().run(task_name=task_name, debug=debug) + except AttributeError as exc: + error_msg = str(exc) + sentinel = "'NoneType' object has no attribute 'max_score'" + if sentinel not in error_msg: + raise + + root_cause = exc.__context__ or exc.__cause__ or exc + self.logger.error( + "FreeAgent failed to reset the environment before receiving initial observations. " + "Check that the configured container image exists and is accessible." + ) + + raise root_cause diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py index 45e03705..8327e5bc 100644 --- a/debug_gym/gym/envs/__init__.py +++ b/debug_gym/gym/envs/__init__.py @@ -1,5 +1,6 @@ from debug_gym.gym.envs.aider import AiderBenchmarkEnv from debug_gym.gym.envs.env import RepoEnv, TooledEnv +from debug_gym.gym.envs.free_env import FreeEnv from debug_gym.gym.envs.local import LocalEnv from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv from debug_gym.gym.envs.r2egym import R2EGymEnv @@ -12,6 +13,7 @@ "AiderBenchmarkEnv", "RepoEnv", "TooledEnv", + "FreeEnv", "LocalEnv", "MiniNightmareEnv", "R2EGymEnv", @@ -39,6 +41,8 @@ def select_env(env_type: str = None) -> type[RepoEnv]: return MiniNightmareEnv case "r2egym": return R2EGymEnv + case "free": + return FreeEnv case _: raise ValueError(f"Unknown environment {env_type}") diff --git a/debug_gym/gym/envs/free_env.py b/debug_gym/gym/envs/free_env.py new file mode 100644 index 00000000..1942f249 --- /dev/null +++ b/debug_gym/gym/envs/free_env.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import shlex +from pathlib import Path +from typing import Any + +from debug_gym.gym.envs.env import RepoEnv +from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.logger import DebugGymLogger + + +class FreeEnv(RepoEnv): + """Lightweight RepoEnv wrapper for running arbitrary container images.""" + + DEFAULT_TASK_NAME = "free-session" + + def __init__( + self, + image: str, + *, + terminal: Terminal | None = None, + mount_path: str | Path | None = None, + setup_commands: list[str] | None = None, + instructions: str | None = None, + init_git: bool = True, + workspace_dir: str | Path = "/testbed", + logger: DebugGymLogger | None = None, + **env_kwargs: Any, + ) -> None: + """Create a free-form environment backed by an existing repository terminal.""" + self.container_image = image + self._custom_instructions = (instructions or "").strip() + self.init_git = init_git + self._setup_commands = list(setup_commands or []) + self._workspace_dir = str(workspace_dir) + + shared_logger = logger or DebugGymLogger("debug-gym") + + super().__init__( + path=str(mount_path) if mount_path is not None else None, + entrypoint="true", + debug_entrypoint="true", + max_score=0, + terminal=terminal, + logger=shared_logger, + **env_kwargs, + ) + + if self.terminal is not None: + self._apply_terminal_settings() + + def _apply_terminal_settings(self) -> None: + """Keep terminal metadata (image/setup commands) in sync with env state.""" + terminal = self.terminal + if terminal is None: + return + if hasattr(terminal, "base_image"): + setattr(terminal, "base_image", self.container_image) + + if hasattr(terminal, "setup_commands"): + terminal.setup_commands = list(self._setup_commands) + + if hasattr(terminal, "working_dir") and not isinstance(terminal, LocalTerminal): + try: + terminal.working_dir = self._workspace_dir + except ValueError: + self.logger.debug( + "Terminal already active; keeping working_dir=%s", + getattr(terminal, "working_dir", self._workspace_dir), + ) + + if hasattr(terminal, "task_name"): + try: + terminal.task_name = self.DEFAULT_TASK_NAME + except ValueError: + self.logger.debug( + "Terminal already active; keeping existing task name." + ) + + terminal.logger = self.logger + + def load_dataset(self, problems: str | list[str] | None = None): + """Expose a single synthetic task keyed by DEFAULT_TASK_NAME.""" + return {self.DEFAULT_TASK_NAME: {"image": self.container_image}} + + def setup_task(self, task_name: str | None, options: dict | None = None) -> None: + """Record base image metadata for consistency with RepoEnv expectations.""" + self.task_name = task_name or self.DEFAULT_TASK_NAME + self.base_image = self.container_image + if hasattr(self.terminal, "base_image"): + setattr(self.terminal, "base_image", self.base_image) + + def setup_workspace(self) -> None: + """Ensure the remote workspace matches the configured working directory.""" + if isinstance(self.terminal, LocalTerminal): + super().setup_workspace() + return + + self.workspace.reset() + self.workspace.working_dir = Path(self._workspace_dir) + if self.terminal is not None: + current_dir = getattr(self.terminal, "working_dir", None) + if current_dir != self._workspace_dir: + try: + self.terminal.working_dir = self._workspace_dir + except ValueError: + self.logger.debug( + "Terminal already active; keeping working_dir=%s", current_dir + ) + # Ensure core utilities exist before RepoEnv renders directory listings. + self.terminal.run( + "apt-get update -y && apt-get install -y tree", raises=True + ) + self.terminal.run( + f"mkdir -p {shlex.quote(self._workspace_dir)}", + raises=True, + ) + + if self.path: + self.workspace.copy_content(self.path) + + self.workspace.setup_file_filters() + + def setup_terminal(self) -> None: + """Apply FreeEnv tweaks and reuse RepoEnv git bootstrapping when enabled.""" + self._apply_terminal_settings() + + if self.terminal is not None: + self.terminal.run("touch .debugignore .debugreadonly") + + if not self.init_git: + return + if not self._git_available(): + self.logger.debug( + "Git is not available in the container; skipping repository setup.", + ) + return + super().setup_terminal() + + def _git_available(self) -> bool: + """Check for git presence before attempting repository initialization.""" + if self.terminal is None: + return False + success, _ = self.terminal.run("command -v git") + return success + + @property + def instructions(self) -> str: + """Provide user-facing guidance, falling back to a generic sandbox blurb.""" + return ( + self._custom_instructions + or "You are placed in an isolated Linux environment, use the available tools to interact with the environment effectively." + ) + + def reset(self, *, options: dict | None = None): + """Allow callers to mutate container settings before delegating to RepoEnv.""" + options = options or {} + + image = options.get("image") + workspace_dir = options.get("workspace_dir") + setup_commands = options.get("setup_commands") + instructions = options.get("instructions") + init_git = options.get("init_git") + + restart_terminal = False + + if image and image != self.container_image: + self.container_image = image + restart_terminal = True + + if workspace_dir and str(workspace_dir) != self._workspace_dir: + self._workspace_dir = str(workspace_dir) + restart_terminal = True + + if setup_commands is not None: + new_commands = list(setup_commands) + if new_commands != self._setup_commands: + self._setup_commands = new_commands + restart_terminal = True + + if instructions is not None: + self._custom_instructions = instructions + + if init_git is not None: + self.init_git = bool(init_git) + + if restart_terminal and self.terminal is not None: + try: + self.terminal.close() + except Exception as exc: # noqa: BLE001 - diagnostics only + self.logger.debug("Failed to close terminal cleanly: %s", exc) + + self._apply_terminal_settings() + + return super().reset(options=options) diff --git a/debug_gym/gym/terminals/__init__.py b/debug_gym/gym/terminals/__init__.py index 068a8b6a..1f34e1d8 100644 --- a/debug_gym/gym/terminals/__init__.py +++ b/debug_gym/gym/terminals/__init__.py @@ -13,8 +13,20 @@ def select_terminal( if terminal_config is None: return None + if isinstance(terminal_config, Terminal): + return terminal_config + + if not isinstance(terminal_config, dict): + raise TypeError( + "terminal configuration must be a dict, Terminal instance, or None", + ) + + config = dict(terminal_config) + terminal_type = str(config.pop("type", "")).lower() + if not terminal_type: + raise ValueError("Terminal configuration must include a 'type' key") + logger = logger or DebugGymLogger("debug-gym") - terminal_type = terminal_config["type"] match terminal_type: case "docker": terminal_class = DockerTerminal @@ -25,8 +37,17 @@ def select_terminal( case _: raise ValueError(f"Unknown terminal {terminal_type}") + extra_labels = config.pop("extra_labels", {}) or {} + if uuid is not None: + extra_labels = {**extra_labels, "uuid": uuid} + + if terminal_class is KubernetesTerminal and extra_labels: + config["extra_labels"] = extra_labels + + if terminal_class is not KubernetesTerminal: + config.pop("extra_labels", None) + return terminal_class( - **terminal_config, logger=logger, - extra_labels={"uuid": uuid}, + **config, ) diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py index d628cabe..f89e0fb9 100644 --- a/debug_gym/gym/terminals/kubernetes.py +++ b/debug_gym/gym/terminals/kubernetes.py @@ -381,9 +381,47 @@ def default_shell_command(self) -> list[str]: bash_cmd = "/bin/bash --noprofile --norc --noediting" return f"kubectl {kubeconfig}exec -it {self.pod.name} -c main -n {self.pod.namespace} -- {bash_cmd}" + def _ensure_pod_running(self) -> None: + """Ensure the backing pod exists and is in Running phase.""" + if self._pod is None: + self.setup_pod() + return + + try: + if self._pod.is_running(): + return + except Exception as exc: # noqa: BLE001 - diagnostics only + self.logger.debug(f"{self._pod} status check failed: {exc}") + + self.logger.debug(f"{self._pod} not running anymore.") + + # Check logs and describe for diagnostics + try: + pod_logs = self.k8s_client.read_namespaced_pod_log( + name=self._pod.name, namespace=self._pod.namespace + ) + pod_description = self.k8s_client.read_namespaced_pod( + name=self._pod.name, namespace=self._pod.namespace + ) + self.logger.debug( + f"[{self._pod.name}] Pod logs before failure:\n{pod_logs}\n" + f"Pod description before failure:\n{pod_description}" + ) + except Exception as log_exc: + self.logger.debug( + f"[{self._pod.name}] Failed to get pod logs/description: {log_exc}" + ) + + self.logger.debug(f"Cleaning up {self._pod} after failure.") + try: + self._pod.clean_up() + except Exception as exc: # noqa: BLE001 - best-effort cleanup + self.logger.debug(f"Failed to clean up {self._pod}: {exc}") + + raise RuntimeError("Pod is not running anymore.") + def new_shell_session(self): - if not self.pod.is_running(): - raise ValueError("Pod is not running. Cannot create shell session.") + self._ensure_pod_running() session = ShellSession( shell_command=self.default_shell_command, @@ -427,8 +465,7 @@ def run( strip_output: bool = True, ) -> tuple[bool, str]: """Run a command in the pod. Return command status and output.""" - if not self.pod.is_running(): - raise ValueError("Pod is not running. Cannot run commands.") + self._ensure_pod_running() command = self.prepare_command(entrypoint) @@ -466,8 +503,25 @@ def run( except ApiException as e: success = False self.logger.debug( - f"[{self.pod.name}] Exception during command execution: {e}" + f"[{self.pod.name}] Exception during command `{command}`: {e}" ) + # Get kubectl logs and describe for diagnostics + try: + pod_logs = self.k8s_client.read_namespaced_pod_log( + name=self.pod.name, namespace=self.pod.namespace + ) + pod_description = self.k8s_client.read_namespaced_pod( + name=self.pod.name, namespace=self.pod.namespace + ) + self.logger.debug( + f"[{self.pod.name}] Pod logs:\n{pod_logs}\n" + f"Pod description:\n{pod_description}" + ) + except Exception as log_exc: + self.logger.debug( + f"[{self.pod.name}] Failed to get pod logs/description: {log_exc}" + ) + output = f"Command execution failed: {str(e)}" backoff = random.uniform(5, 10) # seconds time.sleep(backoff) @@ -498,7 +552,7 @@ def setup_pod(self, max_retries: int = 3) -> None: for attempt in range(max_retries): # Generate a new pod name for each attempt to avoid sandbox conflicts pod_name = _clean_for_kubernetes( - self._pod_name or f"dbg-gym-{self.task_name}-{str(uuid.uuid4())[:8]}" + self._pod_name or f"dbg-gym.{self.task_name}.{str(uuid.uuid4())[:8]}" ) self.logger.debug( f"Setting up pod {pod_name} (attempt {attempt + 1}/{max_retries}) " diff --git a/debug_gym/gym/tools/submit.py b/debug_gym/gym/tools/submit.py index eb08e578..00514992 100644 --- a/debug_gym/gym/tools/submit.py +++ b/debug_gym/gym/tools/submit.py @@ -9,7 +9,14 @@ class SubmitTool(EnvironmentTool): description = "Submit your changes once the task is complete." arguments = {} + def __init__(self, eval_on_submit=True): + super().__init__() + self.eval_on_submit = eval_on_submit + def use(self, environment, **kwargs) -> Observation: - eval_output = environment.eval() + output = "The agent terminated the session." + if self.eval_on_submit: + output = environment.eval().output + environment.terminated = True - return Observation(self.name, eval_output.output) + return Observation(self.name, output) diff --git a/scripts/config_free_env.yaml b/scripts/config_free_env.yaml new file mode 100644 index 00000000..2f9b06cc --- /dev/null +++ b/scripts/config_free_env.yaml @@ -0,0 +1,42 @@ +# Configuration for standalone FreeEnv + FreeAgent runs. +task_name: free-session + +llm: + name: "4o-az" + +# Tools to load into the environment toolbox. +tools: + - rewrite + - bash + - submit: + eval_on_submit: False # Here we only terminate after submission, no auto-eval. + +environment: + image: jyangballin/swesmith.x86_64.amueller_1776_word_cloud.ec24191c:latest + workspace_dir: /testbed + terminal: + type: docker + # type: kubernetes + # registry: debuggymacr.azurecr.io + # namespace: mtl-cpu-jobs + # kube_config: ~/.kube/config + # # kube_context: null + # pod_spec_kwargs: + # tolerations: + # - key: node.kubernetes.io/disk-pressure + # operator: Exists + # effect: NoExecute + # tolerationSeconds: 10800 + # - key: kubernetes.azure.com/scalesetpriority + # operator: Equal + # value: spot + # effect: NoSchedule + # - key: CriticalAddonsOnly + # operator: Equal + # value: "true" + # effect: NoSchedule + +agent: + random_seed: 42 + max_steps: 20 + output_path: exps/free_env diff --git a/scripts/free_env_human.py b/scripts/free_env_human.py new file mode 100644 index 00000000..a82a9e6a --- /dev/null +++ b/scripts/free_env_human.py @@ -0,0 +1,200 @@ +"""Interactive FreeEnv demo that runs a container image with a human operator.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any, Iterable + +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.terminals import select_terminal +from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.tools.toolbox import Toolbox +from debug_gym.llms.human import Human +from debug_gym.logger import DebugGymLogger + +DEFAULT_IMAGE = "swesmith.x86_64.amueller__word_cloud.ec24191c" +DEFAULT_TOOLS = [ + "listdir", + "view", + "grep", + "rewrite", + "bash", + {"submit": {"eval_on_submit": False}}, +] + + +def format_observations(env_info) -> list[dict]: + messages = [ + { + "role": "system", + "content": env_info.instructions or "Interact with the repository.", + } + ] + + instructions_text = (env_info.instructions or "").strip() + for index, observation in enumerate(env_info.all_observations): + text = observation.observation.strip() + if index == 0 and text == instructions_text: + continue + prefix = f"[{observation.source}] " if observation.source else "" + messages.append({"role": "user", "content": f"{prefix}{text}"}) + return messages + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Launch a FreeEnv session with human-in-the-loop control.", + ) + parser.add_argument( + "--image", + default=DEFAULT_IMAGE, + help="Docker image name to load inside the environment.", + ) + parser.add_argument( + "--terminal", + default="docker", + choices=["docker", "kubernetes"], + help="Terminal backend to use.", + ) + parser.add_argument( + "--registry", + default=None, + help="Optional registry prefix (e.g. ghcr.io/swe-bench).", + ) + parser.add_argument( + "--workspace-dir", + default="/testbed", + help="Working directory inside the container or pod.", + ) + parser.add_argument( + "--mount-path", + type=Path, + default=None, + help="Optional host path whose contents should be copied into the environment.", + ) + parser.add_argument( + "--setup-command", + action="append", + default=[], + help="Additional setup commands to run when the terminal starts (repeatable).", + ) + parser.add_argument( + "--tool", + dest="tools", + action="append", + default=None, + help="Tool name to add to the toolbox (can be specified multiple times).", + ) + parser.add_argument( + "--init-git", + action="store_true", + help="Initialize a git repository inside the environment (disabled by default).", + ) + parser.add_argument( + "--instructions", + default=None, + help="Custom instruction text displayed at reset.", + ) + parser.add_argument( + "--max-retries", + type=int, + default=10, + help="Maximum number of retries for invalid human tool calls.", + ) + parser.add_argument( + "--dir-tree-depth", + type=int, + default=2, + help="Depth of the directory tree shown in observations.", + ) + return parser + + +def _add_tools(env: FreeEnv, tool_specs: Iterable[Any], logger: DebugGymLogger) -> None: + """Attach toolbox entries, defaulting submit to eval_on_submit=False for humans.""" + + for spec in tool_specs: + tool_kwargs: dict[str, Any] = {} + if isinstance(spec, dict): + if len(spec) != 1: + raise ValueError("Tool dictionary must contain exactly one entry") + spec = dict(spec) + tool_name, tool_kwargs = next(iter(spec.items())) + else: + tool_name = str(spec) + + if tool_name == "submit" and "eval_on_submit" not in tool_kwargs: + tool_kwargs = {**tool_kwargs, "eval_on_submit": False} + + env.add_tool(Toolbox.get_tool(tool_name, **tool_kwargs)) + logger.debug("Loaded tool %s with options %s", tool_name, tool_kwargs) + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + logger = DebugGymLogger("free-env-demo") + + tool_specs: list[Any] + if args.tools: + # User-specified tools override defaults but still respect submit behaviour. + tool_specs = list(args.tools) + else: + tool_specs = list(DEFAULT_TOOLS) + + terminal_config: dict[str, Any] = { + "type": args.terminal, + "base_image": args.image, + "working_dir": args.workspace_dir, + } + if args.setup_command: + terminal_config["setup_commands"] = list(args.setup_command) + if args.registry: + terminal_config["registry"] = args.registry + + terminal: Terminal | None = select_terminal(terminal_config, logger=logger) + + env = FreeEnv( + image=args.image, + terminal=terminal, + mount_path=args.mount_path, + setup_commands=args.setup_command, + instructions=args.instructions, + init_git=args.init_git, + workspace_dir=args.workspace_dir, + logger=logger, + dir_tree_depth=args.dir_tree_depth, + ) + + _add_tools(env, tool_specs, logger) + logger.info("Loaded tools: %s", env.tool_names) + + info = env.reset() + human = Human(logger=logger, max_retries=args.max_retries) + + try: + while True: + messages = format_observations(info) + response = human(messages, env.tools) + logger.info( + "Running %s with arguments %s", + response.tool.name, + response.tool.arguments, + ) + info = env.step( + response.tool, + action_content=response.response, + ) + except KeyboardInterrupt: + logger.info("Session interrupted by user.") + except ValueError as exc: + logger.error("Session terminated: %s", exc) + finally: + env.close() + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_free_env.py b/scripts/run_free_env.py new file mode 100644 index 00000000..13d7b367 --- /dev/null +++ b/scripts/run_free_env.py @@ -0,0 +1,163 @@ +"""Standalone runner for FreeEnv + FreeAgent with human-visible logging.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any, Mapping + +from debug_gym.agents.free_agent import FreeAgent +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.terminals import select_terminal +from debug_gym.gym.terminals.terminal import Terminal +from debug_gym.gym.tools.toolbox import Toolbox +from debug_gym.llms.base import LLM +from debug_gym.llms.human import Human +from debug_gym.logger import DebugGymLogger + + +def build_parser() -> argparse.ArgumentParser: + """Create the CLI parser that exposes the runner configuration flag.""" + parser = argparse.ArgumentParser(description="Run FreeAgent against FreeEnv.") + parser.add_argument( + "--config", + type=Path, + default=Path("scripts/config_free_env.yaml"), + help="Path to the YAML configuration file.", + ) + return parser + + +def load_app_config(path: Path) -> dict: + """Load the YAML configuration used to seed the environment and agent.""" + import yaml + + with open(path, "r", encoding="utf-8") as handle: + return yaml.safe_load(handle) + + +def build_llm(config: dict, logger: DebugGymLogger): + """Instantiate the LLM (or human driver) based on configuration defaults.""" + llm_cfg = config.get("llm") or {} + llm_name = llm_cfg.get("name") or config.get("llm_name") or "human" + + if llm_name.lower() == "human": + return Human(model_name="human", logger=logger) + + return LLM.instantiate( + llm_name=llm_name, + llm_config_file_path=llm_cfg.get("config_file") + or config.get("llm_config_file_path"), + logger=logger, + ) + + +def resolve_terminal( + env_config: Mapping[str, Any], + logger: DebugGymLogger, +) -> Terminal | None: + """Resolve the requested terminal backend, normalizing legacy config shapes.""" + terminal_setting = env_config.get("terminal") + + if isinstance(terminal_setting, Terminal): + return terminal_setting + + if terminal_setting is None: + terminal_config: dict[str, Any] = {"type": "docker"} + elif isinstance(terminal_setting, str): + terminal_config = {"type": terminal_setting} + elif isinstance(terminal_setting, Mapping): + terminal_config = dict(terminal_setting) + else: + raise TypeError( + "terminal configuration must be a mapping, string, Terminal, or None", + ) + + terminal_config.setdefault("type", "docker") + terminal_config["type"] = str(terminal_config["type"]).lower() + terminal_config.setdefault("base_image", env_config["image"]) + terminal_config.setdefault( + "working_dir", env_config.get("workspace_dir", "/testbed") + ) + + setup_commands = env_config.get("setup_commands") + if setup_commands: + terminal_config.setdefault("setup_commands", list(setup_commands)) + + overrides = dict(env_config.get("terminal_kwargs") or {}) + terminal_config.update(overrides) + + return select_terminal(terminal_config, logger=logger) + + +def add_tools(env: FreeEnv, tools_config: list[Any], logger: DebugGymLogger) -> None: + """Instantiate tools defined in config, honoring optional per-tool kwargs.""" + + for tool_entry in tools_config: + tool_kwargs: dict[str, Any] = {} + if isinstance(tool_entry, Mapping): + if len(tool_entry) != 1: + raise ValueError("Tool mapping entries must contain a single tool name") + tool_entry = dict(tool_entry) + tool_name, tool_kwargs = next(iter(tool_entry.items())) + else: + tool_name = str(tool_entry) + + if tool_name == "submit" and "eval_on_submit" not in tool_kwargs: + tool_kwargs = {**tool_kwargs, "eval_on_submit": False} + + env.add_tool(Toolbox.get_tool(tool_name, **tool_kwargs)) + logger.debug("Added tool %s with options %s", tool_name, tool_kwargs) + + +def main() -> int: + """Entrypoint for running FreeAgent against FreeEnv from the command line.""" + args = build_parser().parse_args() + config = load_app_config(args.config) + + logger = DebugGymLogger("free-agent-run") + + env_cfg = config["environment"] + terminal = resolve_terminal(env_cfg, logger) + # Copy only the knobs understood by FreeEnv, leaving unrelated config behind. + env_kwargs = dict( + image=env_cfg["image"], + terminal=terminal, + mount_path=env_cfg.get("mount_path"), + setup_commands=env_cfg.get("setup_commands"), + instructions=env_cfg.get("instructions"), + init_git=env_cfg.get("init_git", True), + workspace_dir=env_cfg.get("workspace_dir", "/testbed"), + logger=logger, + dir_tree_depth=env_cfg.get("dir_tree_depth", 2), + ) + + # Instantiate the environment once the terminal and core parameters are ready. + env = FreeEnv(**env_kwargs) + + tools_config = config.get("tools") + if not tools_config: + raise ValueError( + "Configuration must specify a non-empty 'tools' list for FreeEnv sessions." + ) + + add_tools(env, tools_config, logger) + + llm = build_llm(config, logger) + agent_config = config.get("agent", {}) + agent = FreeAgent(config=agent_config, env=env, llm=llm, logger=logger) + + task_name = config.get("task_name", "free-session") + + try: + resolved = agent.run(task_name=task_name) + agent.save_trajectory(task_name=task_name) + agent.save_patch(task_name=task_name) + logger.info(f"Run complete. Resolved={resolved}") + return 0 + finally: + env.close() + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/agents/conftest.py b/tests/agents/conftest.py index 4d89b746..9f0ac4d1 100644 --- a/tests/agents/conftest.py +++ b/tests/agents/conftest.py @@ -27,7 +27,7 @@ def agent_setup(tmp_path, open_data): def _length(text): return len(text) - def _agent_setup(agent_class): + def _agent_setup(agent_class, *, config_override=None): with ( patch("tiktoken.encoding_for_model") as mock_encoding_for_model, patch("os.path.exists", return_value=True), @@ -45,6 +45,8 @@ def _agent_setup(agent_class): "memory_size": 10, "random_seed": 42, } + if config_override: + config_dict.update(config_override) env = MagicMock() env.task_name = "test_task" llm = MagicMock() diff --git a/tests/agents/test_free_agent.py b/tests/agents/test_free_agent.py new file mode 100644 index 00000000..0bb9ffd2 --- /dev/null +++ b/tests/agents/test_free_agent.py @@ -0,0 +1,62 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from debug_gym.agents.base_agent import BaseAgent +from debug_gym.agents.free_agent import FreeAgent + + +@pytest.fixture +def make_free_agent(agent_setup): + def _factory(*, config_override=None): + agent, env, llm = next(agent_setup(FreeAgent, config_override=config_override)) + agent.logger = MagicMock() + return agent, env, llm + + return _factory + + +def test_free_agent_run_delegates_to_base(make_free_agent): + agent, _, _ = make_free_agent() + + with patch.object(BaseAgent, "run", return_value=True) as mock_run: + result = agent.run(task_name="demo", debug=True) + + mock_run.assert_called_once_with(task_name="demo", debug=True) + assert result is True + + +def test_free_agent_reraises_root_cause_for_missing_reset(make_free_agent): + agent, _, _ = make_free_agent() + + def side_effect(*args, **kwargs): + try: + raise RuntimeError("reset failed") + except RuntimeError as exc: # pragma: no cover - exercised below + raise AttributeError( + "'NoneType' object has no attribute 'max_score'" + ) from exc + + with patch.object(BaseAgent, "run", side_effect=side_effect): + with pytest.raises(RuntimeError) as excinfo: + agent.run(task_name="demo") + + assert str(excinfo.value) == "reset failed" + agent.logger.error.assert_called_once() + + +def test_free_agent_bubbles_unrelated_attribute_error(make_free_agent): + agent, _, _ = make_free_agent() + + with patch.object(BaseAgent, "run", side_effect=AttributeError("other")): + with pytest.raises(AttributeError, match="other"): + agent.run(task_name="demo") + + agent.logger.error.assert_not_called() + + +def test_free_agent_system_prompt_override(make_free_agent): + custom_prompt = "Inspect quietly." + agent, _, _ = make_free_agent(config_override={"system_prompt": custom_prompt}) + + assert agent.system_prompt == custom_prompt diff --git a/tests/gym/envs/test_free_env.py b/tests/gym/envs/test_free_env.py new file mode 100644 index 00000000..99fd3ecd --- /dev/null +++ b/tests/gym/envs/test_free_env.py @@ -0,0 +1,133 @@ +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock + +from debug_gym.gym.envs.free_env import FreeEnv +from debug_gym.gym.terminals.local import LocalTerminal +from debug_gym.gym.terminals.terminal import Terminal + + +class DummyTerminal(Terminal): + """Test helper terminal with minimal behavior for FreeEnv interactions.""" + + def __init__( + self, + *, + working_dir: str = "/tmp/test", + logger: Any | None = None, + base_image: str | None = None, + setup_commands: list[str] | None = None, + ): + super().__init__(working_dir=working_dir, logger=logger) + self.base_image = base_image + self.setup_commands = list(setup_commands or []) + self.closed = False + + def prepare_command(self, entrypoint): + return ["/bin/true"] + + def run(self, entrypoint, timeout=None, raises=False, strip_output=True): + if isinstance(entrypoint, str) and "tree" in entrypoint: + return True, "/workspace\n" + return True, "" + + @property + def default_shell_command(self): + return "/bin/true" + + def new_shell_session(self): + return None + + def copy_content(self, src, target=None): + return None + + def close(self): + self.closed = True + + +def test_free_env_defaults_to_local_terminal(): + logger = MagicMock() + + env = FreeEnv(image="ubuntu:22.04", logger=logger) + + assert isinstance(env.terminal, LocalTerminal) + assert env.container_image == "ubuntu:22.04" + + +def test_free_env_configures_existing_terminal(): + logger = MagicMock() + terminal_logger = MagicMock() + terminal = DummyTerminal( + working_dir="/initial", + logger=terminal_logger, + base_image="base", + setup_commands=["existing"], + ) + + env = FreeEnv( + image="ubuntu:22.04", + terminal=terminal, + setup_commands=["apt update"], + workspace_dir="/workspace", + logger=logger, + init_git=False, + ) + + env.reset() + + assert env.terminal is terminal + assert terminal.base_image == "ubuntu:22.04" + assert terminal.working_dir == "/workspace" + assert terminal.logger is logger + assert terminal.setup_commands == ["apt update"] + + +def test_free_env_respects_custom_workspace(tmp_path): + logger = MagicMock() + terminal = DummyTerminal(logger=logger) + + env = FreeEnv( + image="ubuntu:22.04", + terminal=terminal, + workspace_dir="/workspace", + logger=logger, + init_git=False, + ) + + env.reset() + + assert env.workspace.working_dir == Path("/workspace") + assert terminal.working_dir == "/workspace" + + +def test_free_env_reset_allows_dynamic_overrides(): + logger = MagicMock() + terminal = DummyTerminal(logger=logger, setup_commands=["initial"]) + + env = FreeEnv( + image="ubuntu:22.04", + terminal=terminal, + setup_commands=["initial"], + workspace_dir="/workspace", + logger=logger, + init_git=True, + ) + + env.reset( + options={ + "image": "ubuntu:24.04", + "workspace_dir": "/new", + "setup_commands": ["echo ready"], + "instructions": "Inspect carefully.", + "init_git": False, + } + ) + + assert env.container_image == "ubuntu:24.04" + assert env.instructions == "Inspect carefully." + assert env.init_git is False + assert env._workspace_dir == "/new" + assert terminal.working_dir == "/new" + assert terminal.setup_commands == ["echo ready"] + assert terminal.base_image == "ubuntu:24.04" + assert terminal.closed is True diff --git a/tests/gym/terminals/test_terminal.py b/tests/gym/terminals/test_terminal.py index 3867ea18..eb2ac54c 100644 --- a/tests/gym/terminals/test_terminal.py +++ b/tests/gym/terminals/test_terminal.py @@ -154,3 +154,37 @@ def test_select_terminal_unknown(): def test_select_terminal_invalid_config(): with pytest.raises(TypeError): select_terminal("not a dict") + + +def test_select_terminal_kubernetes_extra_labels(monkeypatch): + captured = {} + + class DummyK8s: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr( + "debug_gym.gym.terminals.KubernetesTerminal", + DummyK8s, + ) + + config = { + "type": "kubernetes", + "namespace": "example", + "extra_labels": {"foo": "bar"}, + "pod_spec_kwargs": {"tolerations": []}, + } + + terminal = select_terminal(config, uuid="1234") + + assert isinstance(terminal, DummyK8s) + assert captured["namespace"] == "example" + assert captured["pod_spec_kwargs"] == {"tolerations": []} + assert captured["extra_labels"] == {"foo": "bar", "uuid": "1234"} + assert "logger" in captured + assert config == { + "type": "kubernetes", + "namespace": "example", + "extra_labels": {"foo": "bar"}, + "pod_spec_kwargs": {"tolerations": []}, + }