Merge pull request #48 from alan-turing-institute/train_timing_flat_bask

llewelld · web-flow · commit 5c8a128afcd1 · 2025-09-12T16:39:21.000+01:00
Add support for running different enc/decoder depths on Baskerville
diff --git a/train/batch/flat_vs_composite/bask-train-ddp-1x1.sh b/train/batch/flat_vs_composite/bask-train-ddp-1x1.sh
@@ -0,0 +1,90 @@
+#!/bin/bash -l
+# vim: et:ts=4:sts=4:sw=4
+#SBATCH --qos turing
+#SBATCH --account usjs9456-ati-test
+#SBATCH --time 1:00:0
+#SBATCH --nodes 1
+#SBATCH --ntasks-per-node 1
+#SBATCH --gpus-per-node 1
+#SBATCH --mem 0
+#SBATCH --constraint=a100_80
+#SBATCH --job-name aurora-train
+#SBATCH --output bask-encoder-%a.txt
+
+# Execute as a job array; each array task ID becomes the encoder/decoder depth:
+# sbatch --array=5-29 ./bask-train-ddp-1x1.sh
+
+# 1 node, 1 GPU
+# For this we don't need to 'skip' any GPUs
+
+#set -o xtrace
+set -o errexit
+
+pushd ../../scripts
+
+if [ ! -d ../../downloads ]; then
+  echo "Please run the batch-download.sh script to download the data."
+  exit 1
+fi
+
+echo
+echo "## Loading modules"
+
+module -q purge
+module -q load baskerville
+module -q load bask-apps/live
+module -q load PyTorch/2.0.1-foss-2022a-CUDA-11.7.0
+module -q load torchvision/0.15.2-foss-2022a-CUDA-11.7.0
+
+echo
+echo "## Configuring environment"
+
+export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
+export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+export OMP_NUM_THREADS=1
+export ENCODER_DEPTH="${SLURM_ARRAY_TASK_ID:?not set; submit as a job array, e.g. sbatch --array=5-29}"
+
+echo
+echo "## Initialising virtual environment"
+
+python -m venv venv
+. ./venv/bin/activate
+
+pip install --quiet --upgrade pip
+pip install --quiet "../../.[bask]"  # quoted so the shell cannot glob-expand [bask]
+
+echo
+echo "## Details"
+echo
+echo "Nodes: ${SLURM_JOB_NUM_NODES}"
+echo "GPUs per node: ${SLURM_GPUS_PER_NODE}"
+echo "Primary address: ${PRIMARY_ADDR}"
+echo "Primary port: ${PRIMARY_PORT}"
+echo "Encoder depth: ${ENCODER_DEPTH}"
+
+echo
+echo "## Running model"
+
+# Track GPU and CPU metrics
+#nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
+#vmstat -t 1 -y > log-train-cpu.txt &
+
+# Run the training script once through torch.distributed.run
+# Single quotes defer variable expansion to the shell that srun launches
+srun bash -c \
+    'python -m torch.distributed.run \
+    --nnodes ${SLURM_JOB_NUM_NODES} \
+    --nproc-per-node ${SLURM_GPUS_PER_NODE} \
+    --master_addr ${PRIMARY_ADDR} \
+    --master_port ${PRIMARY_PORT} \
+    --node_rank ${SLURM_NODEID} \
+    train_ed.py \
+    --download_path ../../downloads \
+    --encoders ${ENCODER_DEPTH} \
+    --grad_accum 8'
+
+echo
+echo "## Tidying up"
+
+deactivate
+popd
diff --git a/train/scripts/train_ed.py b/train/scripts/train_ed.py
@@ -31,6 +31,20 @@
     help="path to download directory",
     default="../../era5/era_v_inf",
 )
+parser.add_argument(
+    "--encoders",
+    "-e",
+    type=int,
+    help="encoder/decoder depth",
+    default=12,  # same depth that was previously hard-coded in the Aurora() call
+)
+parser.add_argument(
+    "--grad_accum",
+    "-g",
+    type=int,
+    help="gradient accumulation steps; must be a multiple of the world size",
+    default=1,  # NOTE(review): passes the world-size check only if WORLD_SIZE is 1
+)
 args = parser.parse_args()
 
 if args.xpu:
@@ -76,8 +90,12 @@
     RANK = int(os.environ["RANK"])
     LOCAL_RANK = int(os.environ["LOCAL_RANK"])
 
+# Validate explicitly instead of using assert: asserts are stripped under -O, and
+# 0 would pass a bare modulo test only to raise ZeroDivisionError in the training loop.
+if args.grad_accum < 1 or args.grad_accum % WORLD_SIZE != 0:
+    parser.error("--grad_accum must be a positive multiple of the world size")
 
-def main(download_path: str, xpu: bool = False):
+def main(download_path: str, encoder_depth: int, xpu: bool = False):
     if xpu:
         comms_backend = "ccl"
         device_type = "xpu"
@@ -102,15 +117,17 @@ def main(download_path: str, xpu: bool = False):
     model = Aurora(
         use_lora=False,  # Model was not fine-tuned.
         autocast=True,  # Use AMP.
-        encoder_depths=(12, 12, 12),
+        encoder_depths=(encoder_depth, encoder_depth, encoder_depth),
         encoder_num_heads=(4, 8, 16),
-        decoder_depths=(12, 12, 12),
+        decoder_depths=(encoder_depth, encoder_depth, encoder_depth),
         decoder_num_heads=(16, 8, 4),
         embed_dim=256,
         num_heads=8,
     )
     # can no longer load checkpoint as we have different model size
     # model.load_checkpoint("microsoft/aurora", "aurora-0.25-pretrained.ckpt")
+    if not xpu:
+        torch.cuda.set_device(LOCAL_RANK)
 
     download_path = Path(download_path)
 
@@ -147,6 +164,8 @@ def main(download_path: str, xpu: bool = False):
 
     times = []
 
+    n_batches_per_optim = args.grad_accum // WORLD_SIZE
+
     time_start = time.time()
     for batch, (X, y) in enumerate(data_loader):
         print(f"batch {batch}...", flush=True)
@@ -170,8 +189,9 @@ def main(download_path: str, xpu: bool = False):
         print("performing backward pass...", flush=True)
         loss.backward()
 
-        print("optimizing...", flush=True)
-        optimizer.step()
+        if batch % n_batches_per_optim == 0:
+            print("optimizing...")
+            optimizer.step()
 
         time_end = time.time()
         times.append(time_end - time_start)
@@ -185,6 +205,9 @@ def main(download_path: str, xpu: bool = False):
         avg_time = sum([sum(t[1:]) for t in gathered_times]) / sum(
             [len(times[1:]) for t in gathered_times]
         )
+        print(
+            f"Encoder/decoder depth: ({encoder_depth}, {encoder_depth}, {encoder_depth})", flush=True
+        )
         print(
             f"Average time per epoch (ignoring first): {avg_time} seconds", flush=True
         )
@@ -206,4 +229,4 @@ def main(download_path: str, xpu: bool = False):
     print("done", flush=True)
 
 
-main(args.download_path, xpu=args.xpu)
+main(args.download_path, args.encoders, xpu=args.xpu)