Commit 504139c
Update scripts for Dawn to match Baskerville training
Updates the scripts and instructions for Dawn. The main purposes of these changes are:

1. To ensure the training aligns with that done on Baskerville.
2. To ensure enough data is downloaded.
3. To provide scripts for all of the steps for improved reproducibility.
4. To update the instructions to make the steps clear.
1 parent 17d6070 commit 504139c

File tree

13 files changed: +699 −61 lines


dawn/batch/dawn-download-era5.sh

Lines changed: 76 additions & 0 deletions (new file)

```sh
#!/bin/bash -l
#SBATCH --job-name download-era5
#SBATCH --output results/download-era5-%A.out
#SBATCH --account airr-p8-rcpp-dawn-gpu
#SBATCH --partition pvc9          # Dawn PVC partition
#SBATCH --cpus-per-task 24        # Number of cores per task
#SBATCH --nodes 1                 # Number of nodes
#SBATCH --gpus-per-node 1         # Number of requested GPUs per node
#SBATCH --ntasks-per-node 1       # MPI ranks per node
#SBATCH --time 02:00:00

# Execute using:
# sbatch ./dawn-download-era5.sh

echo
echo "## Aurora download ERA5 data script starting"

# Quit on error
set -e

pushd ../scripts

echo
echo "## Loading modules"

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

echo
echo "## Configuring environment"

VENV_DIR=../../dawn/environments/venv_3_11_11

echo
echo "## Initialising virtual environment"

source ${VENV_DIR}/bin/activate

echo
echo "## Details"
echo
echo "Nodes: ${SLURM_JOB_NUM_NODES}"
echo "GPUs per node: ${SLURM_GPUS_PER_NODE}"
echo "Tasks per node: ${SLURM_NTASKS_PER_NODE}"
echo "CPUs per task: ${SLURM_CPUS_PER_TASK}"
echo "Working directory: $(realpath ${PWD})"
echo "Location of venv: $(realpath $VENV_DIR)"

echo
echo "## Downloading data"

START=$(date +%s)
python era_v_download.py
END=$(date +%s)
ELAPSED=$((${END}-${START}))

echo
echo "## Details post"
echo
echo "Time completed: $(date --iso-8601=ns)"
echo "Epoch start: ${START}"
echo "Epoch end: ${END}"
echo "Elapsed: ${ELAPSED} seconds"

echo
echo "## Tidying up"

deactivate
popd

echo
echo "## Aurora download ERA5 data script completed"
```
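The `START`/`END` bracket above is a plain epoch-seconds timer. A standalone sketch of the idiom, with `sleep 1` standing in for the real workload:

```shell
# Epoch-seconds timing bracket, as used around the download and install steps.
START=$(date +%s)
sleep 1   # stand-in for the real workload (python era_v_download.py)
END=$(date +%s)
ELAPSED=$((END - START))
echo "Elapsed: ${ELAPSED} seconds"
```

Second-level granularity is coarse but sufficient for jobs measured in minutes or hours.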

dawn/scripts/era_v_download.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -42,7 +42,7 @@
 print("Static variables downloaded!")

 # Download the surface-level variables.
-if not (download_path / "2023-01-surface-level.nc").exists():
+if not (download_path / "2023-01-surface-level-36.nc").exists():
     c.retrieve(
         "reanalysis-era5-single-levels",
         {
@@ -69,12 +69,12 @@
             "time": ["00:00", "06:00", "12:00", "18:00"],
             "format": "netcdf",
         },
-        str(download_path / "2023-01-surface-level.nc"),
+        str(download_path / "2023-01-surface-level-36.nc"),
     )
     print("Surface-level variables downloaded!")

 # Download the atmospheric variables.
-if not (download_path / "2023-01-atmospheric.nc").exists():
+if not (download_path / "2023-01-atmospheric-36.nc").exists():
     c.retrieve(
         "reanalysis-era5-pressure-levels",
         {
@@ -117,6 +117,6 @@
             "time": ["00:00", "06:00", "12:00", "18:00"],
             "format": "netcdf",
         },
-        str(download_path / "2023-01-atmospheric.nc"),
+        str(download_path / "2023-01-atmospheric-36.nc"),
     )
     print("Atmospheric variables downloaded!")
```
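The diff renames the target files (adding a `-36` suffix) but keeps the same exists-check guard around each retrieval. A minimal sketch of that guard, with the `c.retrieve(...)` call replaced by a `touch` so it runs without CDS credentials (the directory and filename here are illustrative):

```python
import tempfile
from pathlib import Path

# Guard pattern from era_v_download.py: only fetch the target file when it
# does not already exist, so re-running the script skips completed downloads.
download_path = Path(tempfile.mkdtemp())

target = download_path / "2023-01-surface-level-36.nc"
if not target.exists():
    # Real script: c.retrieve("reanalysis-era5-single-levels", {...}, str(target))
    target.touch()
    status = "downloaded"
else:
    status = "cached"

print(status)
```

On a second run against the same directory the guard would take the `cached` branch, which is what makes the sbatch job safely re-runnable.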

src/aurora_hpc/dataset.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -36,6 +36,7 @@ def __init__(
         surface_data: str | Path | xr.Dataset = Path("2023-01-01-surface-level.nc"),
         atmos_data: str | Path | xr.Dataset = Path("2023-01-01-atmospheric.nc"),
         use_dask: bool = False,
+        len_max: int | None = None,
     ):
         self.t = t

@@ -80,6 +81,8 @@ def __init__(
         self.length = (
             len(torch.from_numpy(self.surf_vars_ds["t2m"].values)) - self.t - 1
         )
+        if len_max:
+            self.length = min(self.length, len_max)

    def _get_batch(self, timerange):
        """Returns a batch covering a time range.
```

train/README.md

Lines changed: 75 additions & 4 deletions

````diff
@@ -3,15 +3,17 @@
 The code in this folder is for performing various training related experiments.
 See below for instructions for how to run them.

-## Running within an interactive session
+## Baskerville
+
+### Running within an interactive session

 To run the interactive session scripts, first ensure you're on a compute node by running the following or an equivalent `srun` command (you'll need to update the QoS and account details):

 ```sh
 srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
 ```

-## Queued jobs using sbatch
+### Queued jobs using sbatch

 All sbatch scripts have a QoS and account details set in them.
 The parameters used for these will depend on your account and so should be adjusted accordingly.
@@ -21,7 +23,7 @@ The parameters used for these will depend on your account and so should be adjus
 #SBATCH --account usjs9456-ati-test
 ```

-## Baskerville training using FSDP
+### Training using FSDP

 The case of a single node can be run within an srun interactive session or scheduled using the sbatch scripts.

@@ -63,7 +65,7 @@ sbatch bask-train-fsdp-4x4.sh
 This is set up to run on 2 nodes with 2 GPUs and to perform just a single run.
 Edit the script header to test other combinations.

-## Baskerville bandwidth
+### Bandwidth

 All bandwidth experiments should be run within an interactive session and from within the `aurora-hpc/train/batch` directory.

@@ -78,3 +80,72 @@ To run the GPU bandwidth experiments:
 ```sh
 ./bask-srun-gpubw.sh
 ```
+
+## Dawn
+
+The Dawn scripts are all run as batch jobs; there are no interactive session scripts.
+
+All scripts should be run directly from the `aurora-hpc/train/batch` directory.
+
+### Creating the virtual environment
+
+Before performing any training, the virtual environment should be created by running the following script:
+
+```sh
+cd aurora-hpc/train/batch
+sbatch dawn-create-venv.sh
+```
+
+This will create a virtual environment in the `aurora-hpc/dawn/environments/venv_3_11_11` directory.
+
+### Download the data
+
+The data must also be downloaded before training can commence.
+This also requires that you've created an account with the Climate Data Store and created a `.cdsapirc` file in your home directory with the following contents:
+
+```sh
+url: https://cds.climate.copernicus.eu/api
+key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxxx
+```
+
+Here the `x` values must be replaced by your access key.
+You can find out more about how to do this on the [Aurora ERA5 page](https://microsoft.github.io/aurora/example_era5.html).
+
+Once you've set up your access key you can then download the data directly to Dawn using the following sbatch script:
+
+```sh
+cd aurora-hpc/dawn/batch
+sbatch dawn-download-era5.sh
+```
+
+If successful, this will result in the following files being downloaded to the `aurora-hpc/dawn/era5/era_v_inf` directory:
+
+```
+2023-01-atmospheric-36.nc
+2023-01-surface-level-36.nc
+static.nc
+```
+
+### Training
+
+Once the virtual environment is set up, the training can be executed by queueing the appropriate script for the number of nodes and GPUs you want to use.
+
+The following example is for one node and one GPU:
+
+```sh
+cd aurora-hpc/train/batch
+sbatch dawn-train-ddp-1x1.sh
+```
+
+The other available configurations are the following:
+
+```sh
+dawn-train-ddp-1x1.sh # One node with one GPU (one GPU total)
+dawn-train-ddp-1x4.sh # One node with four GPUs (four GPUs total)
+dawn-train-ddp-2x4.sh # Two nodes with two GPUs each (four GPUs total)
+dawn-train-ddp-2x8.sh # Two nodes with four GPUs each (eight GPUs total)
+dawn-train-ddp-4x4.sh # Four nodes with one GPU each (four GPUs total)
+dawn-train-ddp-4x8.sh # Four nodes with two GPUs each (eight GPUs total)
+```
+
+After each run the output logs will be sent to the `aurora-hpc/train/batch/results` directory.
````
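To exercise every configuration in turn, the submissions can be scripted. A dry-run sketch that only prints the commands (on Dawn, replace the `echo` with a bare `sbatch`; the `NxM` suffix is nodes x total GPUs):

```shell
# Print the submission command for each Dawn DDP configuration listed above.
for cfg in 1x1 1x4 2x4 2x8 4x4 4x8; do
  echo "sbatch dawn-train-ddp-${cfg}.sh"
done
```

Submitting them all at once is fine since Slurm queues the jobs independently; the results directory keeps one log per job via the `%A` job-ID pattern in the output filenames.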

train/batch/dawn-create-venv.sh

Lines changed: 80 additions & 0 deletions (new file)

```sh
#!/bin/bash -l
#SBATCH --job-name venv
#SBATCH --output results/create-venv-%A.out
#SBATCH --account airr-p8-rcpp-dawn-gpu
#SBATCH --partition pvc9          # Dawn PVC partition
#SBATCH --cpus-per-task 24        # Number of cores per task
#SBATCH --nodes 1                 # Number of nodes
#SBATCH --gpus-per-node 1         # Number of requested GPUs per node
#SBATCH --ntasks-per-node 1       # MPI ranks per node
#SBATCH --time 01:00:00

# Execute using:
# sbatch ./dawn-create-venv.sh

echo
echo "## Aurora create virtual environment script starting"

# Quit on error
set -e

pushd ../scripts

echo
echo "## Loading modules"

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

echo
echo "## Configuring environment"

VENV_DIR=../../dawn/environments/venv_3_11_11

echo
echo "## Initialising virtual environment"

python3.11 -m venv $VENV_DIR
. ${VENV_DIR}/bin/activate

echo
echo "## Details"
echo
echo "Nodes: ${SLURM_JOB_NUM_NODES}"
echo "GPUs per node: ${SLURM_GPUS_PER_NODE}"
echo "Tasks per node: ${SLURM_NTASKS_PER_NODE}"
echo "CPUs per task: ${SLURM_CPUS_PER_TASK}"
echo "Working directory: $(realpath ${PWD})"
echo "Location of venv: $(realpath $VENV_DIR)"

echo
echo "## Installing packages"

START=$(date +%s)
pip install --upgrade pip
pip install -e ../../.[dawn]
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
pip install --trusted-host pytorch-extension.intel.com intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
END=$(date +%s)
ELAPSED=$((${END}-${START}))

echo
echo "## Details post"
echo
echo "Time completed: $(date --iso-8601=ns)"
echo "Epoch start: ${START}"
echo "Epoch end: ${END}"
echo "Elapsed: ${ELAPSED} seconds"

echo
echo "## Tidying up"

deactivate
popd

echo
echo "## Aurora create virtual environment script completed"
```
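The create/activate/deactivate cycle in the script can be exercised locally. A minimal sketch using `python3` in a throwaway directory (Dawn itself pins `python3.11`; this assumes the `venv` module is available):

```shell
# Create a throwaway venv, activate it, confirm the interpreter, then deactivate.
VENV_DIR="$(mktemp -d)/venv_demo"
python3 -m venv "${VENV_DIR}"
. "${VENV_DIR}/bin/activate"
python -c 'import sys; print(sys.prefix)'   # prints the venv path while active
deactivate
```

The batch script activates with `.` (a POSIX synonym for `source`) so that subsequent `pip install` commands target the venv rather than the system Python.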
