NVIDIA-NeMo · akoumpa · Oct 4, 2025
@@ -18,8 +18,9 @@
 import os
 import time
 from pathlib import Path
-
 import yaml
+import json, subprocess, tempfile, wandb
+from pathlib import Path
 
 logging.getLogger().setLevel(logging.INFO)
 
@@ -160,6 +161,25 @@ def launch_with_slurm(args, job_conf_path, job_dir, slurm_config, extra_args=Non
         slurm_config["extra_mounts"].append(VolumeMapping(Path(repo_root), Path(repo_root)))
     return submit_slurm_job(SlurmConfig(**slurm_config, command=command, chdir=repo_root), job_dir)
 
+def _download_env_artifact(run_url: str) -> Path:
+    """Download the last-listed "environment" artifact logged by a W&B run.
+
+    Args:
+        run_url: Either a W&B path (``entity/project/runs/run_id``) or the full
+            browser URL of the run. Scheme, host, query string and any trailing
+            page segment (e.g. ``/overview``) are stripped before the lookup,
+            because ``wandb.Api.from_path`` expects a bare path.
+
+    Returns:
+        Path of the local directory the artifact was downloaded into.
+
+    Raises:
+        IndexError: If the run has no logged artifact of type "environment".
+            The exception type matches what the bare list indexing raised
+            before; only the message is new.
+    """
+    run_path = run_url.strip()
+    if "://" in run_path:
+        # Function-scope import keeps this helper independent of the module's import list.
+        from urllib.parse import urlparse
+
+        run_path = urlparse(run_path).path
+    segments = [s for s in run_path.split("/") if s]
+    if len(segments) > 4 and segments[2] == "runs":
+        # Drop UI sub-pages such as ".../runs/<id>/overview".
+        segments = segments[:4]
+    run_path = "/".join(segments)
+
+    api = wandb.Api()
+    run = api.from_path(run_path)
+    env_artifacts = [a for a in run.logged_artifacts() if a.type == "environment"]
+    if not env_artifacts:
+        raise IndexError(f"W&B run {run_url!r} has no logged artifact of type 'environment'")
+    # NOTE(review): assumes logged_artifacts() lists the newest artifact last -- confirm.
+    return Path(env_artifacts[-1].download())
+
+def suggest_docker_cmd(env_dir: Path, extra_mount: str | None = None) -> str | None:
+    """Build a ``docker run`` command line from a downloaded environment artifact.
+
+    Args:
+        env_dir: Directory containing the artifact files; ``docker.json`` is read from it.
+        extra_mount: Optional host path to mount read-only at ``/extra``.
+
+    Returns:
+        The command as a single shell string, or None when ``docker.json`` is
+        missing, unreadable, not a JSON object, or records no usable image reference.
+    """
+    # Function-scope imports: the enclosing module's visible import list has neither name.
+    import shlex
+    import shutil
+
+    try:
+        recorded = json.loads((env_dir / "docker.json").read_text())
+    except (OSError, ValueError):
+        return None
+    if not isinstance(recorded, dict):
+        return None
+
+    def _usable(value) -> bool:
+        # "<<... failed: ...>>" is the placeholder the capture side stores when an
+        # inspect command fails; it must never be used as an image reference.
+        return isinstance(value, str) and bool(value.strip()) and not value.startswith("<<")
+
+    ref = next((recorded[k] for k in ("digest_ref", "ref_env") if _usable(recorded.get(k))), None)
+    if ref is None:
+        return None
+
+    # "$(pwd)" is left unquoted by shlex on purpose: the user's shell must expand it.
+    flags = ["-v \"$(pwd)\":\"/workspace\""]
+    if extra_mount:
+        flags.append(f"-v {shlex.quote(extra_mount)}:/extra:ro")
+    if shutil.which("nvidia-smi"):
+        flags.append("--gpus all")
+    return f"docker run --rm -it {' '.join(flags)} {shlex.quote(ref.strip())} bash"
 
 def build_parser() -> argparse.ArgumentParser:
     """
@@ -177,22 +197,30 @@ def build_parser() -> argparse.ArgumentParser:
         choices=["finetune", "pretrain", "kd"],
         help="Command within the domain (e.g., finetune, pretrain, kd, etc)",
     )
-    parser.add_argument(
+    subparser = parser.add_subparsers(help='Job launch from YAML config')
+    subparser.add_argument(
         "domain",
         metavar="<domain>",
         choices=["llm", "vlm"],
         help="Domain to operate on (e.g., LLM, VLM, etc)",
     )
 
     # Optional/required flag
-    parser.add_argument(
+    subparser.add_argument(
         "-c",
         "--config",
         metavar="PATH",
         type=Path,
         required=True,
         help="Path to YAML configuration file",
     )
+    subparser = parser.add_subparsers(help='Job launch from W&B run')
+    subparser.add_argument(
+        "run_url",
+        metavar="RUN_URL",
+        help="W&B run URL",
+    )
+
     # This is defined in torch.distributed.run's parser, but we also define it here.
     # We want to determine if the user passes `--nproc-per-node` via CLI. In particular, we
     # want to use this information to determine whether they want to utilize a subset of the
@@ -263,7 +291,6 @@ def run_interactive(args):
             torchrun_args.nproc_per_node = num_devices
         return thrun(torchrun_args)
 
-
 def main():
     """CLI for running finetune jobs with NeMo-Automodel, supporting torchrun, Slurm & Kubernetes.
 
@@ -274,6 +301,11 @@ def main():
         int: Job's status code
     """
     args, extra = build_parser().parse_known_args()
+    if args.run_url:
+        env_dir = _download_env_artifact(args.run_url)
+        cmd = suggest_docker_cmd(env_dir)
+        raise NotImplementedError("Launching job from W&B run is not implemented yet")
+
     logging.info(f"Domain:  {args.domain}")
     logging.info(f"Command: {args.command}")
     logging.info(f"Config:  {args.config.resolve()}")

@@ -0,0 +1,171 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+import json, os, platform, shutil, subprocess, sys, tempfile
+from pathlib import Path
+from typing import Dict, Any, Optional
+
+try:
+    import wandb
+except Exception:  # pragma: no cover
+    wandb = None  # noqa
+
+def _run(cmd: list[str]) -> str:
+    """Run ``cmd`` and return its combined stdout/stderr as text.
+
+    Any ``Exception`` (missing binary, non-zero exit status, ...) is converted
+    into a ``<<... failed: ...>>`` placeholder string instead of propagating.
+    """
+    try:
+        completed = subprocess.run(
+            cmd,
+            check=True,
+            text=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+        )
+    except Exception as exc:
+        return f"<<{cmd} failed: {exc}>>"
+    return completed.stdout
+
+def _write(path: Path, text: str) -> Optional[Path]:
+    """Write ``text`` to ``path``; return ``path`` on success and None on any ``Exception``."""
+    try:
+        path.write_text(text)
+    except Exception:
+        written = None
+    else:
+        written = path
+    return written
+
+def _git_root(start: Path) -> Path:
+    """Return the top-level directory of the git work tree that contains ``start``.
+
+    Falls back to ``start`` itself when git reports nothing usable (git missing,
+    ``start`` outside a work tree, or the reported path is not a directory).
+    """
+    # `-C` makes git resolve the repository from `start` instead of the process CWD.
+    out = _run(["git", "-C", str(start), "rev-parse", "--show-toplevel"]).strip()
+    # Empty output would become Path("."), which always exists, and "<<...>>" is
+    # the failure placeholder returned by _run; neither is a real path.
+    if not out or out.startswith("<<"):
+        return start
+    candidate = Path(out)
+    return candidate if candidate.is_dir() else start
+
+def _first_repo_digest(raw_inspect: str) -> Optional[str]:
+    """Extract the first ``RepoDigests`` entry from ``<engine> image inspect`` output.
+
+    Returns None when the text is not the expected JSON (for example the
+    ``<<... failed ...>>`` placeholder produced by ``_run``) or when no entry
+    carries a non-empty ``RepoDigests`` list.
+    """
+    try:
+        parsed = json.loads(raw_inspect)
+    except ValueError:
+        return None
+    entries = parsed if isinstance(parsed, list) else [parsed]
+    for entry in entries:
+        digests = entry.get("RepoDigests") if isinstance(entry, dict) else None
+        if isinstance(digests, list) and digests and isinstance(digests[0], str):
+            return digests[0]
+    return None
+
+
+def _detect_docker_context() -> Dict[str, Any]:
+    """
+    Best-effort capture of the image reference and digest.
+    Works when:
+      - WANDB_DOCKER / AUTOMODEL_DOCKER_IMAGE / DOCKER_IMAGE is set
+      - or docker/podman/nerdctl is present and can inspect the ref
+
+    Returns:
+        Dict with keys ``ref_env``, ``engine``, ``digest_ref``, ``raw_inspect``,
+        ``os_release`` and ``container_runtime_guess``. Any value that could not
+        be determined is None. ``digest_ref`` is only set when the inspect output
+        parses as JSON and contains a repo digest; ``raw_inspect`` keeps whatever
+        the inspect command returned, including a failure placeholder.
+    """
+    info: Dict[str, Any] = {
+        "ref_env": os.getenv("WANDB_DOCKER")
+                   or os.getenv("AUTOMODEL_DOCKER_IMAGE")
+                   or os.getenv("DOCKER_IMAGE"),
+        "engine": None,
+        "digest_ref": None,
+        "raw_inspect": None,
+        "os_release": None,
+        "container_runtime_guess": None,
+    }
+
+    # quick OS fingerprint (useful if reproducing on a different base image)
+    try:
+        info["os_release"] = Path("/etc/os-release").read_text()
+    except (OSError, UnicodeError):
+        pass  # file absent or unreadable: leave the field as None
+
+    # try to guess runtime from env files commonly present in containers
+    if Path("/run/.containerenv").exists():
+        info["container_runtime_guess"] = "podman"
+    elif Path("/.dockerenv").exists():
+        info["container_runtime_guess"] = "docker"
+
+    # find a client we can call (first one found on PATH wins)
+    info["engine"] = next(
+        (cand for cand in ("docker", "podman", "nerdctl") if shutil.which(cand)),
+        None,
+    )
+
+    ref, engine = info["ref_env"], info["engine"]
+    if ref and engine:
+        # Inspect once and derive the digest from the JSON. This works the same for
+        # all three clients and keeps the _run failure placeholder out of digest_ref.
+        raw = _run([engine, "image", "inspect", ref])
+        info["raw_inspect"] = raw
+        info["digest_ref"] = _first_repo_digest(raw)
+    return info
+
+SAFE_ENV_PREFIXES = (
+    "CUDA_", "CUDNN_", "NCCL_", "TRANSFORMERS_CACHE", "HF_HOME",
+    "UV_", "PIP_", "PYTORCH_", "WANDB_", "NEMO_", "WORLD_SIZE", "RANK", "LOCAL_RANK",
+)
+
+# Name fragments that mark a variable as credential-bearing even though its prefix is
+# allow-listed above (e.g. WANDB_API_KEY, UV_PUBLISH_TOKEN).
+_SECRET_ENV_MARKERS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "PASSWD", "CREDENTIAL")
+
+
+def _safe_env(environ) -> Dict[str, str]:
+    """Return the allow-listed, non-secret subset of ``environ``.
+
+    A variable is kept only if its name starts with one of ``SAFE_ENV_PREFIXES``,
+    its name contains none of ``_SECRET_ENV_MARKERS``, and its value holds no URL
+    with embedded ``user:password@`` credentials (as index URLs sometimes do).
+    """
+    def _has_url_credentials(value: str) -> bool:
+        # Values may hold several whitespace-separated URLs; check each authority part.
+        return any("@" in part.partition("://")[2].split("/", 1)[0] for part in value.split())
+
+    return {
+        name: value
+        for name, value in environ.items()
+        if name.startswith(SAFE_ENV_PREFIXES)
+        and not any(marker in name.upper() for marker in _SECRET_ENV_MARKERS)
+        and not _has_url_credentials(value)
+    }
+
+
+def log_environment_bundle(run: "wandb.sdk.wandb_run.Run",
+                           project_root: Path | None = None,
+                           artifact_name: str = "runtime-env") -> None:
+    """
+    Capture env + docker + resolver state into a W&B artifact tied to the run.
+    Safe to call on every entrypoint before heavy work starts.
+
+    Args:
+        run: W&B run the artifact and summary entries are attached to.
+        project_root: Directory whose project files and git state are captured.
+            Defaults to the git root of the current working directory.
+        artifact_name: Name of the logged artifact (its type is "environment").
+
+    Raises:
+        ImportError: If ``wandb`` could not be imported.
+    """
+    # Function-scope import: this module's import list does not include logging.
+    import logging
+
+    if wandb is None:
+        # Explicit check instead of `assert`, which is stripped under `python -O`.
+        raise ImportError("wandb must be importable")
+    root = (project_root or _git_root(Path.cwd())).resolve()
+    git = ["git", "-C", str(root)]  # query the captured project, not the process CWD
+
+    files: list[Path] = []
+    # NOTE(review): this directory is not removed here because the files must still
+    # exist when the artifact is uploaded -- confirm when cleanup is safe.
+    tmp = Path(tempfile.mkdtemp(prefix="automodel-env-"))
+
+    def _queue(path: Optional[Path]) -> None:
+        # Skip failed writes (None) and files queued already (pyproject.toml can be hit twice).
+        if path is not None and path not in files:
+            files.append(path)
+
+    def _snapshot(name: str, text: str) -> None:
+        _queue(_write(tmp / name, text))
+
+    # 1) system & python metadata
+    meta = {
+        "python": sys.version,
+        "executable": sys.executable,
+        "platform": platform.platform(),
+        "uname": platform.uname()._asdict(),
+        "pip_version": _run([sys.executable, "-m", "pip", "--version"]),
+        "pip_config_debug": _run([sys.executable, "-m", "pip", "config", "debug"]),
+        "nvidia_smi": _run(["bash", "-lc", "nvidia-smi -x -q || nvidia-smi || true"]),
+        "nvcc": _run(["bash", "-lc", "nvcc --version || true"]),
+        "git_commit": _run(git + ["rev-parse", "HEAD"]).strip(),
+        "git_status": _run(git + ["status", "--porcelain"]),
+        # allow-listed prefixes only, minus anything that looks like a credential
+        "env": _safe_env(os.environ),
+    }
+    _snapshot("env-metadata.json", json.dumps(meta, indent=2))
+
+    # 2) docker context
+    docker_info = _detect_docker_context()
+    _snapshot("docker.json", json.dumps(docker_info, indent=2))
+
+    # 3) resolver snapshots
+    # Prefer uv if present (even if we launched with plain python)
+    if (root / "pyproject.toml").exists() and shutil.which("uv"):
+        for name in ("pyproject.toml", "uv.lock"):
+            candidate = root / name
+            if candidate.exists():
+                _queue(candidate)
+        _snapshot("pip-freeze.txt", _run(["uv", "pip", "freeze"]))
+        _snapshot("requirements-uv.txt", _run(["uv", "export", "--frozen", "--format", "requirements-txt"]))
+    else:
+        _snapshot("pip-freeze.txt", _run([sys.executable, "-m", "pip", "freeze"]))
+        # Conda (optional): only when the binary is actually callable, so the
+        # snapshot is never just a failure placeholder.
+        if shutil.which("conda"):
+            _snapshot("conda-env.yml", _run(["conda", "env", "export"]))
+
+    # 4) include project files if present
+    for fname in ("requirements.txt", "requirements-dev.txt", "pyproject.toml", "setup.cfg", "setup.py"):
+        candidate = root / fname
+        if candidate.exists():
+            _queue(candidate)
+
+    # 5) ship artifact
+    art = wandb.Artifact(artifact_name, type="environment")
+    for f in files:
+        try:
+            art.add_file(str(f))
+        except Exception:
+            # Best-effort: one unreadable file must not abort the run, but leave a trace.
+            logging.getLogger(__name__).warning(
+                "Could not add %s to artifact %s", f, artifact_name, exc_info=True
+            )
+    run.log_artifact(art)
+
+    # Also pin as a run summary for quick access
+    # NOTE(review): art.id may not be populated until the artifact is committed -- confirm.
+    run.summary["environment_artifact"] = art.id
+    if docker_info.get("digest_ref"):
+        run.summary["docker_image"] = docker_info["digest_ref"]
+    elif docker_info.get("ref_env"):
+        run.summary["docker_image"] = docker_info["ref_env"]
@@ -55,6 +55,7 @@
 from nemo_automodel.components.distributed.utils import FirstRankPerNode, get_sync_ctx
 from nemo_automodel.components.loggers.log_utils import setup_logging
 from nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages
+from nemo_automodel.components.loggers.wandb_env import log_environment_bundle
 from nemo_automodel.components.loss.linear_ce import FusedLinearCrossEntropy
 from nemo_automodel.components.loss.masked_ce import MaskedCrossEntropy
 from nemo_automodel.components.optim.scheduler import OptimizerParamScheduler
@@ -593,6 +594,7 @@ def build_wandb(cfg) -> wandb.Run:
     Returns:
         The wandb instance.
     """
+    import wandb_workspaces.reports.v2 as wr
     assert cfg.get("wandb", None) is not None
     kwargs = cfg.wandb.to_dict()
     if kwargs.get("name", "") == "":
@@ -602,6 +604,24 @@ def build_wandb(cfg) -> wandb.Run:
         config=cfg.to_dict(),
         settings=Settings(silent=True),
     )
+    log_environment_bundle(run)
+
+    cmd = f"automodel reproduce --run-url {run.url}"  # whatever you want users to copy
+
+    report = wr.Report(
+        project=run.project,            # same project as the run
+        title=f"Reproduce: {run.name}",
+        description="One-click copy command to reproduce this run"
+    )
+    report.blocks = [
+        wr.H3(text="Copy & run this:"),
+        wr.CodeBlock(code=[cmd], language="bash"),   # renders with a copy button in the UI
+    ]
+    report.save()
+
+    # surface it from the run page too
+    run.summary["reproduce_cmd"] = cmd
+    run.summary["reproduce_report_url"] = report.url
     return run