
Commit 329cea3

Author: Hossein Kavianihamedani (committed)

Remove hardcoded CPU/Memory/GPU resources in SLURM launcher

- Add cpu, memory_mb, and gpus_per_node fields to LauncherConfig (provisioner level)
- Update Slurmlauncher to read from the provisioner config or infer from the SLURM environment:
  * SLURM_CPUS_ON_NODE
  * SLURM_MEM_PER_NODE
  * SLURM_GPUS_PER_NODE / SLURM_GPUS_ON_NODE
- Update qwen3_32b.yaml with commented provisioner resource fields
- Backward compatible: values not set in the config are read from the SLURM environment; if neither source provides them, the launcher fails with a clear error
- Simple, clean implementation at the provisioner level (not per-service/actor)
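Each resource field is resolved independently with the same fallback order; here is a minimal standalone sketch of that order (illustrative helper, not code from this patch):

import os

def resolve(name: str, config_value: int | None, env_var: str) -> int:
    """Illustrative only: config value first, then the SLURM environment, else a hard error."""
    if config_value is not None:
        return config_value                  # 1. explicit provisioner config wins
    env_value = os.environ.get(env_var)
    if env_value:
        return int(env_value)                # 2. fall back to the SLURM allocation
    raise ValueError(f"{name} not set in config and {env_var} is not defined")

# e.g. inside an allocation exporting SLURM_CPUS_ON_NODE=128:
#   resolve("cpu", None, "SLURM_CPUS_ON_NODE") -> 128
#   resolve("cpu", 64, "SLURM_CPUS_ON_NODE")   -> 64 (config overrides the environment)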
1 parent 6e77f0b commit 329cea3

File tree: 3 files changed (+77 / −5 lines)


apps/grpo/qwen3_32b.yaml

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ off_by_n: 1 # Off by one by default
 
 provisioner:
   launcher: slurm
+  cpu: # CPUs per node - if empty, will be inferred from SLURM
+  memory_mb: # Memory in MB per node - if empty, will be inferred from SLURM
+  gpus_per_node: # Number of GPUs per node - if empty, will be inferred from SLURM
 
 # Main loop configuration
 rollout_threads: 32 # make this 4x the number of policy replicas seems to work well
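Note that leaving these keys empty in YAML parses to None, which is what triggers the environment-based inference. A quick check with PyYAML (shown for illustration; not necessarily the loader forge uses):

import yaml

snippet = """
provisioner:
  launcher: slurm
  cpu:                 # left empty -> parsed as None, inferred from SLURM later
  memory_mb: 2062607   # explicit value -> used as-is
  gpus_per_node: 8
"""

cfg = yaml.safe_load(snippet)["provisioner"]
print(cfg["cpu"])        # None
print(cfg["memory_mb"])  # 2062607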

src/forge/controller/launcher.py

Lines changed: 71 additions & 5 deletions
@@ -8,6 +8,7 @@
 
 import copy
 import getpass
+import logging
 import os
 import subprocess
 import tempfile
@@ -27,6 +28,8 @@
 from monarch.tools.commands import create, info
 from monarch.tools.config import Config, Workspace
 
+logger = logging.getLogger(__name__)
+
 _MAST_AVAILABLE = False
 
 try:
@@ -122,6 +125,67 @@ async def remote_setup(self, procs: ProcMesh) -> None:
 
 
 class Slurmlauncher(BaseLauncher):
+    def __init__(self, cfg: LauncherConfig | None = None):
+        self.cfg = cfg
+
+    def _infer_from_slurm_env(self) -> tuple[int | None, int | None, int | None]:
+        """Infer SLURM resources from environment variables."""
+        cpu = os.environ.get("SLURM_CPUS_ON_NODE")
+        mem = os.environ.get("SLURM_MEM_PER_NODE")
+        gpu = os.environ.get(
+            "SLURM_GPUS_PER_NODE", os.environ.get("SLURM_GPUS_ON_NODE")
+        )
+
+        if gpu and ":" in gpu:
+            gpu = gpu.split(":")[-1]
+
+        return (
+            int(cpu) if cpu else None,
+            int(mem) if mem else None,
+            int(gpu) if gpu else None,
+        )
+
+    def _get_resources(self) -> dict[str, int]:
+        """Get resource requirements from config or SLURM environment.
+
+        Priority: config values > SLURM environment variables > error
+        """
+        cpu_count = self.cfg.cpu if self.cfg else None
+        memory_mb = self.cfg.memory_mb if self.cfg else None
+        gpu_count = self.cfg.gpus_per_node if self.cfg else None
+
+        # Infer from SLURM environment variables if values are missing
+        if cpu_count is None or memory_mb is None or gpu_count is None:
+            inferred_cpu, inferred_mem, inferred_gpu = self._infer_from_slurm_env()
+
+            if cpu_count is None:
+                cpu_count = inferred_cpu
+            if memory_mb is None:
+                memory_mb = inferred_mem
+            if gpu_count is None:
+                gpu_count = inferred_gpu
+
+            if cpu_count and memory_mb and gpu_count:
+                logger.info(
+                    f"Inferred SLURM node resources from environment: "
+                    f"{cpu_count} CPUs, {memory_mb} MB memory, {gpu_count} GPUs"
+                )
+
+        # Validate we have all required resources
+        if cpu_count is None or memory_mb is None or gpu_count is None:
+            raise ValueError(
+                f"SLURM launcher requires cpu, memory_mb, and gpus_per_node. "
+                f"Add to provisioner config in YAML or run inside SLURM allocation. "
+                f"Got: cpu={cpu_count}, memory_mb={memory_mb}, gpus_per_node={gpu_count}"
+            )
+
+        logger.info(
+            f"Using SLURM node resources: "
+            f"{cpu_count} CPUs, {memory_mb} MB memory, {gpu_count} GPUs"
+        )
+
+        return {"cpu": cpu_count, "memory_mb": memory_mb, "gpu": gpu_count}
+
     async def initialize(self) -> None:
         # HostMesh currently requires explicit configuration
         # of the underlying transport from client to mesh.
@@ -132,12 +196,14 @@ async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]
         appdef = hyperactor.host_mesh(
             image="test", meshes=[f"{name}:{num_hosts}:gpu.small"]
         )
+
+        # Get resources (same for all allocations)
+        resources = self._get_resources()
+
         for role in appdef.roles:
-            # Note - this is hardcoded to SLURM
-            # We got this with sinfo
-            role.resource.memMB = 2062607
-            role.resource.cpu = 128
-            role.resource.gpu = 8
+            role.resource.memMB = resources["memory_mb"]
+            role.resource.cpu = resources["cpu"]
+            role.resource.gpu = resources["gpu"]
 
         # Note - we cannot add in an empty workspace, so we create a fake temporary one
         temp_workspace = tempfile.mkdtemp(prefix="forge_workspace_")
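For a sense of what the inference path returns, here is a standalone mirror of _infer_from_slurm_env that can run outside of forge; the environment values below are made up to resemble a typical 8-GPU node (SLURM_GPUS_PER_NODE may carry a "type:count" value such as "a100:8", which the split on ":" handles):

import os

# Hypothetical values, as they might appear inside a SLURM allocation.
os.environ.setdefault("SLURM_CPUS_ON_NODE", "128")
os.environ.setdefault("SLURM_MEM_PER_NODE", "2062607")
os.environ.setdefault("SLURM_GPUS_PER_NODE", "a100:8")  # "type:count" form

def infer_from_slurm_env() -> tuple[int | None, int | None, int | None]:
    """Standalone mirror of Slurmlauncher._infer_from_slurm_env, for illustration only."""
    cpu = os.environ.get("SLURM_CPUS_ON_NODE")
    mem = os.environ.get("SLURM_MEM_PER_NODE")
    gpu = os.environ.get("SLURM_GPUS_PER_NODE", os.environ.get("SLURM_GPUS_ON_NODE"))
    if gpu and ":" in gpu:
        gpu = gpu.split(":")[-1]  # strip the GPU type, keep the count
    return (
        int(cpu) if cpu else None,
        int(mem) if mem else None,
        int(gpu) if gpu else None,
    )

print(infer_from_slurm_env())  # -> (128, 2062607, 8)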

src/forge/types.py

Lines changed: 3 additions & 0 deletions
@@ -109,6 +109,9 @@ class LauncherConfig:
     job_name: str = ""
     services: dict[str, ServiceConfig] = field(default_factory=dict)
     actors: dict[str, ProcessConfig] = field(default_factory=dict)
+    cpu: int | None = None # CPUs per node (required for SLURM)
+    memory_mb: int | None = None # Memory in MB per node (required for SLURM)
+    gpus_per_node: int | None = None # GPUs per node (required for SLURM)
 
     def __post_init__(self):
         if isinstance(self.launcher, str):
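Correspondingly, the new fields can be set when building a LauncherConfig directly rather than via YAML. A minimal sketch, assuming forge.types is importable, that launcher accepts the string form (as __post_init__ in the diff context suggests), and that any fields not shown here have defaults:

from forge.types import LauncherConfig

cfg = LauncherConfig(
    launcher="slurm",     # converted from str in __post_init__, per the diff context
    cpu=128,              # leave as None to infer from SLURM_CPUS_ON_NODE
    memory_mb=2062607,    # leave as None to infer from SLURM_MEM_PER_NODE
    gpus_per_node=8,      # leave as None to infer from SLURM_GPUS_PER_NODE / SLURM_GPUS_ON_NODE
)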
