File tree Expand file tree Collapse file tree 9 files changed +32
-29
lines changed
Expand file tree Collapse file tree 9 files changed +32
-29
lines changed Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33
44# Execute using:
5- # sbatch ./bask-local-diskbw.sh
5+ # srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
6+ # ./bask-srun-diskbw.sh
67
78echo "## Aurora disk bandwidth script starting"
89
910# Quit on error
10- set -e
11+ # set -e
1112
1213pushd ../scripts
1314
@@ -34,10 +35,8 @@ python3 -m venv venv
3435. ./venv/bin/activate
3536
3637pip install --quiet --upgrade pip
37- pip install --quiet xarray==2023.1.0
3838pip install --quiet dask==2025.5.1
39- pip install --quiet typing-extensions==4.14.0
40- pip install --quiet -e ../../aurora
39+ pip install --quiet ../../.[bask]
4140
4241echo "## Running model"
4342
Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33
44# Execute using:
5- # sbatch ./bask-local-gpubw.sh
5+ # srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
6+ # ./bask-srun-gpubw.sh
67
78echo "## Aurora GPU bandwidth script starting"
89
910# Quit on error
10- set -e
11+ # set -e
1112
1213pushd ../scripts
1314
@@ -29,7 +30,7 @@ python3 -m venv venv
2930. ./venv/bin/activate
3031
3132pip install --quiet --upgrade pip
32- pip install --quiet typing-extensions==4.14.0
33+ pip install --quiet ../../.[bask]
3334
3435echo "## Running model"
3536
Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33
44# Execute using:
5- # sbatch ./bask-local-train-fsdp.sh
5+ # srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
6+ # ./bask-local-train-fsdp.sh
67
78echo "## Aurora fine-tuning script starting"
89
@@ -30,13 +31,11 @@ export OMP_NUM_THREADS=1
3031
3132echo "## Initialising virtual environment"
3233
33- python -m venv venv
34+ python3 -m venv venv
3435. ./venv/bin/activate
3536
3637pip install --quiet --upgrade pip
37- pip install --quiet xarray==2023.1.0
38- pip install --quiet cdsapi
39- pip install --quiet -e ../../aurora
38+ pip install --quiet ../../.[bask]
4039
4140echo "## Running model"
4241
Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33# SBATCH --qos turing
44# SBATCH --account usjs9456-ati-test
5- # SBATCH --time 0:20:0
5+ # SBATCH --time 1:00:0
66# SBATCH --nodes 1
77# SBATCH --ntasks-per-node 1
88# SBATCH --gpus-per-node 1
1212# SBATCH --output results/one_node_one_gpu.txt
1313
1414# Execute using:
15- # sbatch ./bask-train-fsdp.sh
15+ # sbatch ./bask-train-fsdp-1x1.sh
1616
1717echo
1818echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242export PRIMARY_PORT=$(( 16384 + $RANDOM % 16384 ))
4343export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444export OMP_NUM_THREADS=1
45- export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+ # export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646
4747echo
4848echo "## Initialising virtual environment"
Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33# SBATCH --qos turing
44# SBATCH --account usjs9456-ati-test
5- # SBATCH --time 0:20:0
5+ # SBATCH --time 1:00:0
66# SBATCH --nodes 1
77# SBATCH --ntasks-per-node 1
88# SBATCH --gpus-per-node 4
1212# SBATCH --output results/one_node_four_gpus.txt
1313
1414# Execute using:
15- # sbatch ./bask-train-fsdp.sh
15+ # sbatch ./bask-train-fsdp-1x4.sh
1616
1717echo
1818echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242export PRIMARY_PORT=$(( 16384 + $RANDOM % 16384 ))
4343export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444export OMP_NUM_THREADS=1
45- export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+ # export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646
4747echo
4848echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969vmstat -t 1 -y > log-train-cpu.txt &
7070
7171# Perform the prediction
72+ # Repeat this 4 times so we get better logs
7273for i in {0..3}; do
7374 srun bash -c \
7475 ' python -m torch.distributed.run \
Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33# SBATCH --qos turing
44# SBATCH --account usjs9456-ati-test
5- # SBATCH --time 0:20:0
5+ # SBATCH --time 1:00:0
66# SBATCH --nodes 2
77# SBATCH --ntasks-per-node 1
88# SBATCH --gpus-per-node 2
1212# SBATCH --output results/two_nodes_four_gpus.txt
1313
1414# Execute using:
15- # sbatch ./bask-train-fsdp.sh
15+ # sbatch ./bask-train-fsdp-2x4.sh
1616
1717echo
1818echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242export PRIMARY_PORT=$(( 16384 + $RANDOM % 16384 ))
4343export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444export OMP_NUM_THREADS=1
45- export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+ # export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646
4747echo
4848echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969vmstat -t 1 -y > log-train-cpu.txt &
7070
7171# Perform the prediction
72+ # Repeat this 4 times so we get better logs
7273for i in {0..3}; do
7374 srun bash -c \
7475 ' python -m torch.distributed.run \
Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33# SBATCH --qos turing
44# SBATCH --account usjs9456-ati-test
5- # SBATCH --time 0:20:0
5+ # SBATCH --time 1:00:0
66# SBATCH --nodes 2
77# SBATCH --ntasks-per-node 1
88# SBATCH --gpus-per-node 4
1212# SBATCH --output results/two_nodes_eight_gpus.txt
1313
1414# Execute using:
15- # sbatch ./bask-train-fsdp.sh
15+ # sbatch ./bask-train-fsdp-2x8.sh
1616
1717echo
1818echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242export PRIMARY_PORT=$(( 16384 + $RANDOM % 16384 ))
4343export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444export OMP_NUM_THREADS=1
45- export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+ # export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646
4747echo
4848echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969vmstat -t 1 -y > log-train-cpu.txt &
7070
7171# Perform the prediction
72+ # Repeat this 4 times so we get better logs
7273for i in {0..3}; do
7374 srun bash -c \
7475 ' python -m torch.distributed.run \
Original file line number Diff line number Diff line change 22# vim: et:ts=4:sts=4:sw=4
33# SBATCH --qos turing
44# SBATCH --account usjs9456-ati-test
5- # SBATCH --time 0:20:0
5+ # SBATCH --time 1:00:0
66# SBATCH --nodes 4
77# SBATCH --ntasks-per-node 1
88# SBATCH --gpus-per-node 1
1212# SBATCH --output results/four_nodes_one_gpu.txt
1313
1414# Execute using:
15- # sbatch ./bask-train-fsdp.sh
15+ # sbatch ./bask-train-fsdp-4x4.sh
1616
1717echo
1818echo "## Aurora fine-tuning script starting"
@@ -42,7 +42,7 @@ echo "## Configuring environment"
4242export PRIMARY_PORT=$(( 16384 + $RANDOM % 16384 ))
4343export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4444export OMP_NUM_THREADS=1
45- export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
45+ # export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4646
4747echo
4848echo "## Initialising virtual environment"
@@ -69,6 +69,7 @@ nvidia-smi dmon -o TD -s puct -d 1 > log-train-gpu.txt &
6969vmstat -t 1 -y > log-train-cpu.txt &
7070
7171# Perform the prediction
72+ # Repeat this 4 times so we get better logs
7273for i in {0..3}; do
7374 srun bash -c \
7475 ' python -m torch.distributed.run \
Original file line number Diff line number Diff line change @@ -43,7 +43,7 @@ echo "## Configuring environment"
4343export PRIMARY_PORT=$(( 16384 + $RANDOM % 16384 ))
4444export PRIMARY_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
4545export OMP_NUM_THREADS=1
46- export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
46+ # export HF_HOME="/bask/projects/u/usjs9456-ati-test/"
4747
4848echo
4949echo "## Initialising virtual environment"
You can’t perform that action at this time.
0 commit comments