
Commit a996491

Merge pull request #46 from alan-turing-institute/train_timing_flat
Add scripts for flat vs composite comparison
2 parents f2032d4 + 2eec3f8 commit a996491

File tree

8 files changed: +535 −0 lines changed

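The scripts below differ mainly in two knobs: ZE_FLAT_DEVICE_HIERARCHY, which controls whether Level Zero exposes each PVC card as one COMPOSITE device (with its tiles as sub-devices) or each tile as its own FLAT root device, and the number of MPI ranks per node. A quick way to see what each mode exposes, as a minimal sketch assuming the oneAPI sycl-ls tool is on PATH on a Dawn node:

    # COMPOSITE: one root device per PVC card; tiles are sub-devices.
    ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE sycl-ls

    # FLAT: each tile is its own root device, so one card lists twice.
    ZE_FLAT_DEVICE_HIERARCHY=FLAT sycl-ls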
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#!/bin/bash -l
#SBATCH --job-name=1x1
#SBATCH --output=results/composite_one_node_one_gpu.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:1            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=1     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 1 GPU
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

# Optional per-rank GPU telemetry dump, left disabled:
#mpirun -host ${SLURM_JOB_NODELIST} bash -c 'stdbuf -o0 xpu-smi dump --rawdata --device $SLURM_JOB_GPUS -m 0,1,2,21,22 > gpu-${SLURM_JOB_ID}-${OMPI_COMM_WORLD_RANK}.txt' &

mpirun -prepend-rank -n 1 -ppn 1 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
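To run one of these comparisons, submit the script with sbatch and follow the SLURM output file named in its #SBATCH --output line. A hypothetical session (the script filename is illustrative; only the output path is taken from the script above):

    sbatch composite_one_node_one_gpu.sh     # hypothetical filename
    tail -f results/composite_one_node_one_gpu.out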
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
#!/bin/bash -l
#SBATCH --job-name=1x1
#SBATCH --output=results/flat_one_node_one_gpu_one_tile.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:1            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=1     # MPI ranks per node
#SBATCH --time 02:00:00

# 1 node, 1 GPU, 1 tile
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

# Optional per-rank GPU telemetry dump, left disabled:
#mpirun -host ${SLURM_JOB_NODELIST} bash -c 'stdbuf -o0 xpu-smi dump --rawdata --device $SLURM_JOB_GPUS -m 0,1,2,21,22 > gpu-${SLURM_JOB_ID}-${OMPI_COMM_WORLD_RANK}.txt' &

mpirun -prepend-rank -n 1 -ppn 1 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x1
#SBATCH --output=results/flat_one_node_one_gpu_two_tiles.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:1            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=2     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 1 GPU, 2 tiles
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 2 -ppn 2 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
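In FLAT mode with two ranks per card, the script above leaves device selection to train_ed.py. An alternative, shown here only as a hedged sketch with a hypothetical wrapper (not part of this repo), is to pin each local rank to one tile via ZE_AFFINITY_MASK, using the MPI_LOCALRANKID variable that Intel MPI sets for each process:

    #!/bin/bash
    # pin_rank.sh (hypothetical): pin this rank to one FLAT-mode device,
    # so local rank 0 sees tile 0 and local rank 1 sees tile 1.
    export ZE_AFFINITY_MASK=${MPI_LOCALRANKID}
    exec "$@"

It would be invoked as: mpirun -prepend-rank -n 2 -ppn 2 ./pin_rank.sh python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu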
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x2
#SBATCH --output=results/composite_one_node_two_gpus.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:2            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=2     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 2 GPUs
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 2 -ppn 2 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x4
#SBATCH --output=results/composite_one_node_four_gpus.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:4            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=4     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 4 GPUs
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 4 -ppn 4 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x4
#SBATCH --output=results/flat_one_node_four_gpus_four_tiles.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:4            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=4     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 4 GPUs, 4 tiles
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 4 -ppn 4 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#!/bin/bash -l
#SBATCH --job-name=1x4
#SBATCH --output=results/flat_one_node_four_gpus_eight_tiles.out
#SBATCH --account=airr-p8-rcpp-dawn-gpu
#SBATCH --partition=pvc9        # Dawn PVC partition
#SBATCH -N 1                    # Number of nodes
#SBATCH --gres=gpu:4            # Number of requested GPUs per node
#SBATCH --ntasks-per-node=8     # MPI ranks per node
#SBATCH --time 01:00:00

# 1 node, 4 GPUs, 8 tiles
# For this we don't need to 'skip' any GPUs

#set -o xtrace
set -o errexit

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

pushd ../../scripts

source ../../dawn/environments/venv_3_11_9/bin/activate

export ZE_FLAT_DEVICE_HIERARCHY=FLAT

# Avoid 'too many open file handles' errors.
ulimit -n 1000000

# Avoid MPI failing to initialise.
export CCL_ATL_TRANSPORT=ofi
export FI_PROVIDER=verbs

# Avoids segfaults, for some reason.
export ZES_ENABLE_SYSMAN=1

# Otherwise we're told to set this.
export CCL_ZE_IPC_EXCHANGE=sockets

mpirun -prepend-rank -n 8 -ppn 8 python train_ed.py -d ../../dawn/era5/era_v_inf/ --xpu

deactivate
popd
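Once all the jobs have finished, the flat and composite timings can be compared across the .out files in results/. A minimal sketch, assuming train_ed.py prints per-epoch timings to stdout (adjust the pattern to the actual log format):

    grep -i "epoch" results/composite_*.out results/flat_*.out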
