Update Baskerville sbatch and srun scripts

llewelld · llewelld · commit 6de81154b71e · 2025-08-13T18:26:32.000+01:00
Updates the scripts for queuing using sbatch or running within an interactive
shell on a compute node using srun.

The objective with these changes is to ensure:

1. Everything works simply by running the scripts.
2. There's an element of consistency across the various scripts.
diff --git a/train/batch/bask-srun-diskbw.sh b/train/batch/bask-srun-diskbw.sh
@@ -2,12 +2,13 @@
 # vim: et:ts=4:sts=4:sw=4
 
 # Execute using:
-# sbatch ./bask-local-diskbw.sh
+# srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
+# ./bask-srun-diskbw.sh
 
 echo "## Aurora disk bandwidth script starting"
 
 # Quit on error
-set -e
+# set -e
 
 pushd ../scripts
 
@@ -34,10 +35,8 @@ python3 -m venv venv
 . ./venv/bin/activate
 
 pip install --quiet --upgrade pip
-pip install --quiet xarray==2023.1.0
 pip install --quiet dask==2025.5.1
-pip install --quiet typing-extensions==4.14.0
-pip install --quiet -e ../../aurora
+pip install --quiet ../../.[bask]
 
 echo "## Running model"
 
diff --git a/train/batch/bask-srun-gpubw.sh b/train/batch/bask-srun-gpubw.sh
@@ -2,12 +2,13 @@
 # vim: et:ts=4:sts=4:sw=4
 
 # Execute using:
-# sbatch ./bask-local-gpubw.sh
+# srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
+# ./bask-srun-gpubw.sh
 
 echo "## Aurora GPU bandwidth script starting"
 
 # Quit on error
-set -e
+# set -e
 
 pushd ../scripts
 
@@ -29,7 +30,7 @@ python3 -m venv venv
 . ./venv/bin/activate
 
 pip install --quiet --upgrade pip
-pip install --quiet typing-extensions==4.14.0
+pip install --quiet ../../.[bask]
 
 echo "## Running model"
 
diff --git a/train/batch/bask-srun-train-fsdp.sh b/train/batch/bask-srun-train-fsdp.sh
@@ -2,7 +2,8 @@
 # vim: et:ts=4:sts=4:sw=4
 
 # Execute using:
-# sbatch ./bask-local-train-fsdp.sh
+# srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
+# ./bask-srun-train-fsdp.sh
 
 echo "## Aurora fine-tuning script starting"
 
@@ -30,13 +31,11 @@ export OMP_NUM_THREADS=1
 
 echo "## Initialising virtual environment"
 
-python -m venv venv
+python3 -m venv venv
 . ./venv/bin/activate
 
 pip install --quiet --upgrade pip
-pip install --quiet xarray==2023.1.0
-pip install --quiet cdsapi
-pip install --quiet -e ../../aurora
+pip install --quiet ../../.[bask]
 
 echo "## Running model"
 
diff --git a/train/batch/bask-train-fsdp-1x1.sh b/train/batch/bask-train-fsdp-1x1.sh
@@ -2,7 +2,7 @@
 # vim: et:ts=4:sts=4:sw=4
 #SBATCH --qos turing
 #SBATCH --account usjs9456-ati-test
-#SBATCH --time 0:20:0
+#SBATCH --time 1:00:0
 #SBATCH --nodes 1
 #SBATCH --ntasks-per-node 1
 #SBATCH --gpus-per-node 1
@@ -12,7 +12,7 @@
 #SBATCH --output results/one_node_one_gpu.txt
 
 # Execute using:
-# sbatch ./bask-train-fsdp.sh
+# sbatch ./bask-train-fsdp-1x1.sh
 
 echo
 echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
 export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
 export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export OMP_NUM_THREADS=1
-export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
+#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
 
 echo
 echo "## Initialising virtual environment"
diff --git a/train/batch/bask-train-fsdp-1x4.sh b/train/batch/bask-train-fsdp-1x4.sh
@@ -2,7 +2,7 @@
 # vim: et:ts=4:sts=4:sw=4
 #SBATCH --qos turing
 #SBATCH --account usjs9456-ati-test
-#SBATCH --time 0:20:0
+#SBATCH --time 1:00:0
 #SBATCH --nodes 1
 #SBATCH --ntasks-per-node 1
 #SBATCH --gpus-per-node 4
@@ -12,7 +12,7 @@
 #SBATCH --output results/one_node_four_gpus.txt
 
 # Execute using:
-# sbatch ./bask-train-fsdp.sh
+# sbatch ./bask-train-fsdp-1x4.sh
 
 echo
 echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
 export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
 export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export OMP_NUM_THREADS=1
-export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
+#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
 
 echo
 echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
 vmstat -t 1 -y > log-train-cpu.txt &
 
 # Perform the prediction
+# Repeat this 4 times so we get better logs
 for i in {0..3}; do
     srun bash -c \
         'python -m torch.distributed.run \
diff --git a/train/batch/bask-train-fsdp-2x4.sh b/train/batch/bask-train-fsdp-2x4.sh
@@ -2,7 +2,7 @@
 # vim: et:ts=4:sts=4:sw=4
 #SBATCH --qos turing
 #SBATCH --account usjs9456-ati-test
-#SBATCH --time 0:20:0
+#SBATCH --time 1:00:0
 #SBATCH --nodes 2
 #SBATCH --ntasks-per-node 1
 #SBATCH --gpus-per-node 2
@@ -12,7 +12,7 @@
 #SBATCH --output results/two_nodes_four_gpus.txt
 
 # Execute using:
-# sbatch ./bask-train-fsdp.sh
+# sbatch ./bask-train-fsdp-2x4.sh
 
 echo
 echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
 export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
 export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export OMP_NUM_THREADS=1
-export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
+#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
 
 echo
 echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
 vmstat -t 1 -y > log-train-cpu.txt &
 
 # Perform the prediction
+# Repeat this 4 times so we get better logs
 for i in {0..3}; do
     srun bash -c \
         'python -m torch.distributed.run \
diff --git a/train/batch/bask-train-fsdp-2x8.sh b/train/batch/bask-train-fsdp-2x8.sh
@@ -2,7 +2,7 @@
 # vim: et:ts=4:sts=4:sw=4
 #SBATCH --qos turing
 #SBATCH --account usjs9456-ati-test
-#SBATCH --time 0:20:0
+#SBATCH --time 1:00:0
 #SBATCH --nodes 2
 #SBATCH --ntasks-per-node 1
 #SBATCH --gpus-per-node 4
@@ -12,7 +12,7 @@
 #SBATCH --output results/two_nodes_eight_gpus.txt
 
 # Execute using:
-# sbatch ./bask-train-fsdp.sh
+# sbatch ./bask-train-fsdp-2x8.sh
 
 echo
 echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
 export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
 export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export OMP_NUM_THREADS=1
-export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
+#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
 
 echo
 echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
 vmstat -t 1 -y > log-train-cpu.txt &
 
 # Perform the prediction
+# Repeat this 4 times so we get better logs
 for i in {0..3}; do
     srun bash -c \
         'python -m torch.distributed.run \
diff --git a/train/batch/bask-train-fsdp-4x4.sh b/train/batch/bask-train-fsdp-4x4.sh
@@ -2,7 +2,7 @@
 # vim: et:ts=4:sts=4:sw=4
 #SBATCH --qos turing
 #SBATCH --account usjs9456-ati-test
-#SBATCH --time 0:20:0
+#SBATCH --time 1:00:0
 #SBATCH --nodes 4
 #SBATCH --ntasks-per-node 1
 #SBATCH --gpus-per-node 1
@@ -12,7 +12,7 @@
 #SBATCH --output results/four_nodes_one_gpu.txt
 
 # Execute using:
-# sbatch ./bask-train-fsdp.sh
+# sbatch ./bask-train-fsdp-4x4.sh
 
 echo
 echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
 export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
 export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export OMP_NUM_THREADS=1
-export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
+#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
 
 echo
 echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
 vmstat -t 1 -y > log-train-cpu.txt &
 
 # Perform the prediction
+# Repeat this 4 times so we get better logs
 for i in {0..3}; do
     srun bash -c \
         'python -m torch.distributed.run \
diff --git a/train/batch/bask-train-fsdp.sh b/train/batch/bask-train-fsdp.sh
@@ -43,7 +43,7 @@ echo "## Configuring environment"
 export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
 export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export OMP_NUM_THREADS=1
-export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
+#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
 
 echo
 echo "## Initialising virtual environment"