Skip to content

Commit 6de8115

Browse files
committed
Update Baskerville sbatch and srun scripts
Updates the scripts for queuing jobs with sbatch or running them within an interactive shell on a compute node using srun. The objectives of these changes are to ensure that: 1. Everything works simply by running the scripts. 2. There is a degree of consistency across the various scripts.
1 parent c04e908 commit 6de8115

9 files changed

+32
-29
lines changed

train/batch/bask-local-diskbw.sh renamed to train/batch/bask-srun-diskbw.sh

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22
# vim: et:ts=4:sts=4:sw=4
33

44
# Execute using:
5-
# sbatch ./bask-local-diskbw.sh
5+
# srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
6+
# ./bask-srun-diskbw.sh
67

78
echo "## Aurora disk bandwidth script starting"
89

910
# Quit on error
10-
set -e
11+
# set -e
1112

1213
pushd ../scripts
1314

@@ -34,10 +35,8 @@ python3 -m venv venv
3435
. ./venv/bin/activate
3536

3637
pip install --quiet --upgrade pip
37-
pip install --quiet xarray==2023.1.0
3838
pip install --quiet dask==2025.5.1
39-
pip install --quiet typing-extensions==4.14.0
40-
pip install --quiet -e ../../aurora
39+
pip install --quiet ../../.[bask]
4140

4241
echo "## Running model"
4342

train/batch/bask-local-gpubw.sh renamed to train/batch/bask-srun-gpubw.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22
# vim: et:ts=4:sts=4:sw=4
33

44
# Execute using:
5-
# sbatch ./bask-local-gpubw.sh
5+
# srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
6+
# ./bask-srun-gpubw.sh
67

78
echo "## Aurora GPU bandwidth script starting"
89

910
# Quit on error
10-
set -e
11+
# set -e
1112

1213
pushd ../scripts
1314

@@ -29,7 +30,7 @@ python3 -m venv venv
2930
. ./venv/bin/activate
3031

3132
pip install --quiet --upgrade pip
32-
pip install --quiet typing-extensions==4.14.0
33+
pip install --quiet ../../.[bask]
3334

3435
echo "## Running model"
3536

train/batch/bask-local-train-fsdp.sh renamed to train/batch/bask-srun-train-fsdp.sh

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
# vim: et:ts=4:sts=4:sw=4
33

44
# Execute using:
5-
# sbatch ./bask-local-train-fsdp.sh
5+
# srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
6+
# ./bask-local-train-fsdp.sh
67

78
echo "## Aurora fine-tuning script starting"
89

@@ -30,13 +31,11 @@ export OMP_NUM_THREADS=1
3031

3132
echo "## Initialising virtual environment"
3233

33-
python -m venv venv
34+
python3 -m venv venv
3435
. ./venv/bin/activate
3536

3637
pip install --quiet --upgrade pip
37-
pip install --quiet xarray==2023.1.0
38-
pip install --quiet cdsapi
39-
pip install --quiet -e ../../aurora
38+
pip install --quiet ../../.[bask]
4039

4140
echo "## Running model"
4241

train/batch/bask-train-fsdp-1x1.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# vim: et:ts=4:sts=4:sw=4
33
#SBATCH --qos turing
44
#SBATCH --account usjs9456-ati-test
5-
#SBATCH --time 0:20:0
5+
#SBATCH --time 1:00:0
66
#SBATCH --nodes 1
77
#SBATCH --ntasks-per-node 1
88
#SBATCH --gpus-per-node 1
@@ -12,7 +12,7 @@
1212
#SBATCH --output results/one_node_one_gpu.txt
1313

1414
# Execute using:
15-
# sbatch ./bask-train-fsdp.sh
15+
# sbatch ./bask-train-fsdp-1x1.sh
1616

1717
echo
1818
echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242
export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
4343
export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444
export OMP_NUM_THREADS=1
45-
export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+
#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646

4747
echo
4848
echo "## Initialising virtual environment"

train/batch/bask-train-fsdp-1x4.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# vim: et:ts=4:sts=4:sw=4
33
#SBATCH --qos turing
44
#SBATCH --account usjs9456-ati-test
5-
#SBATCH --time 0:20:0
5+
#SBATCH --time 1:00:0
66
#SBATCH --nodes 1
77
#SBATCH --ntasks-per-node 1
88
#SBATCH --gpus-per-node 4
@@ -12,7 +12,7 @@
1212
#SBATCH --output results/one_node_four_gpus.txt
1313

1414
# Execute using:
15-
# sbatch ./bask-train-fsdp.sh
15+
# sbatch ./bask-train-fsdp-1x4.sh
1616

1717
echo
1818
echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242
export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
4343
export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444
export OMP_NUM_THREADS=1
45-
export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+
#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646

4747
echo
4848
echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969
vmstat -t 1 -y > log-train-cpu.txt &
7070

7171
# Perform the prediction
72+
# Repeat this 4 times so we get better logs
7273
for i in {0..3}; do
7374
srun bash -c \
7475
'python -m torch.distributed.run \

train/batch/bask-train-fsdp-2x4.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# vim: et:ts=4:sts=4:sw=4
33
#SBATCH --qos turing
44
#SBATCH --account usjs9456-ati-test
5-
#SBATCH --time 0:20:0
5+
#SBATCH --time 1:00:0
66
#SBATCH --nodes 2
77
#SBATCH --ntasks-per-node 1
88
#SBATCH --gpus-per-node 2
@@ -12,7 +12,7 @@
1212
#SBATCH --output results/two_nodes_four_gpus.txt
1313

1414
# Execute using:
15-
# sbatch ./bask-train-fsdp.sh
15+
# sbatch ./bask-train-fsdp-2x4.sh
1616

1717
echo
1818
echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242
export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
4343
export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444
export OMP_NUM_THREADS=1
45-
export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+
#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646

4747
echo
4848
echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969
vmstat -t 1 -y > log-train-cpu.txt &
7070

7171
# Perform the prediction
72+
# Repeat this 4 times so we get better logs
7273
for i in {0..3}; do
7374
srun bash -c \
7475
'python -m torch.distributed.run \

train/batch/bask-train-fsdp-2x8.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# vim: et:ts=4:sts=4:sw=4
33
#SBATCH --qos turing
44
#SBATCH --account usjs9456-ati-test
5-
#SBATCH --time 0:20:0
5+
#SBATCH --time 1:00:0
66
#SBATCH --nodes 2
77
#SBATCH --ntasks-per-node 1
88
#SBATCH --gpus-per-node 4
@@ -12,7 +12,7 @@
1212
#SBATCH --output results/two_nodes_eight_gpus.txt
1313

1414
# Execute using:
15-
# sbatch ./bask-train-fsdp.sh
15+
# sbatch ./bask-train-fsdp-2x8.sh
1616

1717
echo
1818
echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242
export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
4343
export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444
export OMP_NUM_THREADS=1
45-
export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+
#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646

4747
echo
4848
echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969
vmstat -t 1 -y > log-train-cpu.txt &
7070

7171
# Perform the prediction
72+
# Repeat this 4 times so we get better logs
7273
for i in {0..3}; do
7374
srun bash -c \
7475
'python -m torch.distributed.run \

train/batch/bask-train-fsdp-4x4.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# vim: et:ts=4:sts=4:sw=4
33
#SBATCH --qos turing
44
#SBATCH --account usjs9456-ati-test
5-
#SBATCH --time 0:20:0
5+
#SBATCH --time 1:00:0
66
#SBATCH --nodes 4
77
#SBATCH --ntasks-per-node 1
88
#SBATCH --gpus-per-node 1
@@ -12,7 +12,7 @@
1212
#SBATCH --output results/four_nodes_one_gpu.txt
1313

1414
# Execute using:
15-
# sbatch ./bask-train-fsdp.sh
15+
# sbatch ./bask-train-fsdp-4x4.sh
1616

1717
echo
1818
echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242
export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
4343
export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444
export OMP_NUM_THREADS=1
45-
export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+
#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646

4747
echo
4848
echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969
vmstat -t 1 -y > log-train-cpu.txt &
7070

7171
# Perform the prediction
72+
# Repeat this 4 times so we get better logs
7273
for i in {0..3}; do
7374
srun bash -c \
7475
'python -m torch.distributed.run \

train/batch/bask-train-fsdp.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ echo "## Configuring environment"
4343
export PRIMARY_PORT=$((16384 + $RANDOM % 16384))
4444
export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4545
export OMP_NUM_THREADS=1
46-
export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
46+
#export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4747

4848
echo
4949
echo "## Initialising virtual environment"

0 commit comments

Comments
 (0)