
Commit a996491

Merge pull request #46 from alan-turing-institute/train_timing_flat
Add scripts for flat vs composite comparison
2 parents f2032d4 + 2eec3f8 commit a996491

File tree

8 files changed: +535 −0 lines changed

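The scripts below differ mainly in two knobs: ZE_FLAT_DEVICE_HIERARCHY, which controls whether Level Zero exposes each PVC card as one COMPOSITE device (with its tiles as sub-devices) or each tile as its own FLAT root device, and the number of MPI ranks per node. A quick way to see what each mode exposes, as a minimal sketch assuming the oneAPI sycl-ls tool is on PATH on a Dawn node:

    # COMPOSITE: one root device per PVC card; tiles are sub-devices.
    ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE sycl-ls

    # FLAT: each tile is its own root device, so one card lists twice.
    ZE_FLAT_DEVICE_HIERARCHY=FLAT sycl-ls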
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#!/bin/bash -l
#SBATCH --job-name=1x1
#SBATCH --output=results/composite_one_node_one_gpu.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:1            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=1     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 1 GPU
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

# Optional per-rank GPU telemetry dump, left disabled:
#mpirun -host ${SLURM_JOB_NODELIST} bash -c 'stdbuf -o0 xpu-smi dump --rawdata --device $SLURM_JOB_GPUS -m 0,1,2,21,22 > gpu-${SLURM_JOB_ID}-${OMPI_COMM_WORLD_RANK}.txt' &

mpirun -prepend-rank -n 1 -ppn 1 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
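To run one of these comparisons, submit the script with sbatch and follow the SLURM output file named in its #SBATCH --output line. A hypothetical session (the script filename is illustrative; only the output path is taken from the script above):

    sbatch composite_one_node_one_gpu.sh     # hypothetical filename
    tail -f results/composite_one_node_one_gpu.out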
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#!/bin/bash -l
#SBATCH --job-name=1x1
#SBATCH --output=results/flat_one_node_one_gpu_one_tile.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:1            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=1     # MPI ranks per node
#SBATCH --time 02:00:00

# 1 node, 1 GPU, 1 tile
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

# Optional per-rank GPU telemetry dump, left disabled:
#mpirun -host ${SLURM_JOB_NODELIST} bash -c 'stdbuf -o0 xpu-smi dump --rawdata --device $SLURM_JOB_GPUS -m 0,1,2,21,22 > gpu-${SLURM_JOB_ID}-${OMPI_COMM_WORLD_RANK}.txt' &

mpirun -prepend-rank -n 1 -ppn 1 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x1
#SBATCH --output=results/flat_one_node_one_gpu_two_tiles.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:1            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=2     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 1 GPU, 2 tiles
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 2 -ppn 2 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
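In FLAT mode with two ranks per card, the script above leaves device selection to train_ed.py. An alternative, shown here only as a hedged sketch with a hypothetical wrapper (not part of this repo), is to pin each local rank to one tile via ZE_AFFINITY_MASK, using the MPI_LOCALRANKID variable that Intel MPI sets for each process:

    #!/bin/bash
    # pin_rank.sh (hypothetical): pin this rank to one FLAT-mode device,
    # so local rank 0 sees tile 0 and local rank 1 sees tile 1.
    export ZE_AFFINITY_MASK=${MPI_LOCALRANKID}
    exec "$@"

It would be invoked as: mpirun -prepend-rank -n 2 -ppn 2 ./pin_rank.sh python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu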
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x2
#SBATCH --output=results/composite_one_node_two_gpus.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:2            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=2     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 2 GPUs
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 2 -ppn 2 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x4
#SBATCH --output=results/composite_one_node_four_gpus.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:4            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=4     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 4 GPUs
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 4 -ppn 4 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x4
#SBATCH --output=results/flat_one_node_four_gpus_four_tiles.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:4            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=4     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 4 GPUs, 4 tiles
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 4 -ppn 4 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x4
#SBATCH --output=results/flat_one_node_four_gpus_eight_tiles.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:4            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=8     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 4 GPUs, 8 tiles
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 8 -ppn 8 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
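Once all the jobs have finished, the flat and composite timings can be compared across the .out files in results/. A minimal sketch, assuming train_ed.py prints per-epoch timings to stdout (adjust the pattern to the actual log format):

    grep -i "epoch" results/composite_*.out results/flat_*.out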
