Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions dawn/batch/dawn-download-era5.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash -l
#SBATCH --job-name download-era5
#SBATCH --output results/download-era5-%A.out
#SBATCH --account airr-p8-rcpp-dawn-gpu
#SBATCH --partition pvc9 # Dawn PVC partition
#SBATCH --cpus-per-task 24 # Number of cores per task
#SBATCH --nodes 1 # Number of nodes
#SBATCH --gpus-per-node 1 # Number of requested GPUs per node
#SBATCH --ntasks-per-node 1 # MPI ranks per node
#SBATCH --time 02:00:00

# Downloads the ERA5 training data from the Climate Data Store onto Dawn
# by running the era_v_download.py script inside the project virtual
# environment. Requires a valid ~/.cdsapirc with CDS API credentials.
#
# Execute using:
# sbatch ./dawn-download-era5.sh

echo
echo "## Aurora download ERA5 data script starting"

# Quit on error
set -e

# The download script lives alongside this batch script's sibling directory.
pushd ../scripts

echo
echo "## Loading modules"

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

echo
echo "## Configuring environment"

# Path is relative to ../scripts (the directory pushed onto above).
VENV_DIR=../../dawn/environments/venv_3_11_11

echo
echo "## Initialising virtual environment"

source "${VENV_DIR}/bin/activate"

echo
echo "## Details"
echo
echo "Nodes: ${SLURM_JOB_NUM_NODES}"
echo "GPUs per node: ${SLURM_GPUS_PER_NODE}"
echo "Tasks per node: ${SLURM_NTASKS_PER_NODE}"
echo "CPUS per task: ${SLURM_CPUS_PER_TASK}"
echo "Working directory: $(realpath "${PWD}")"
echo "Location of venv: $(realpath "${VENV_DIR}")"

echo
echo "## Downloading data"

# Time the download so elapsed wall-clock time can be reported below.
START=$(date +%s)
python era_v_download.py
END=$(date +%s)
ELAPSED=$((END - START))

echo
echo "## Details post"
echo
echo "Time completed: $(date --iso-8601=ns)"
echo "Epoch start: ${START}"
echo "Epoch end: ${END}"
echo "Elapsed: ${ELAPSED} seconds"

echo
echo "## Tidying up"

deactivate
popd

echo
echo "## Aurora download ERA5 data script completed"
9 changes: 4 additions & 5 deletions dawn/scripts/era_v_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

c = cdsapi.Client(
url="https://cds.climate.copernicus.eu/api",
key=os.environ["CDSAPI_KEY"],
debug=True,
)

Expand Down Expand Up @@ -42,7 +41,7 @@
print("Static variables downloaded!")

# Download the surface-level variables.
if not (download_path / "2023-01-surface-level.nc").exists():
if not (download_path / "2023-01-surface-level-36.nc").exists():
c.retrieve(
"reanalysis-era5-single-levels",
{
Expand All @@ -69,12 +68,12 @@
"time": ["00:00", "06:00", "12:00", "18:00"],
"format": "netcdf",
},
str(download_path / "2023-01-surface-level.nc"),
str(download_path / "2023-01-surface-level-36.nc"),
)
print("Surface-level variables downloaded!")

# Download the atmospheric variables.
if not (download_path / "2023-01-atmospheric.nc").exists():
if not (download_path / "2023-01-atmospheric-36.nc").exists():
c.retrieve(
"reanalysis-era5-pressure-levels",
{
Expand Down Expand Up @@ -117,6 +116,6 @@
"time": ["00:00", "06:00", "12:00", "18:00"],
"format": "netcdf",
},
str(download_path / "2023-01-atmospheric.nc"),
str(download_path / "2023-01-atmospheric-36.nc"),
)
print("Atmospheric variables downloaded!")
3 changes: 3 additions & 0 deletions src/aurora_hpc/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(
surface_data: str | Path | xr.Dataset = Path("2023-01-01-surface-level.nc"),
atmos_data: str | Path | xr.Dataset = Path("2023-01-01-atmospheric.nc"),
use_dask: bool = False,
len_max: int = None,
):
self.t = t

Expand Down Expand Up @@ -80,6 +81,8 @@ def __init__(
self.length = (
len(torch.from_numpy(self.surf_vars_ds["t2m"].values)) - self.t - 1
)
if len_max:
self.length = min(self.length, len_max)

def _get_batch(self, timerange):
"""Returns a batch covering a time range.
Expand Down
79 changes: 75 additions & 4 deletions train/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
The code in this folder is for performing various training related experiments.
See below for instructions for how to run them.

## Running within an interactive session
## Baskerville

### Running within an interactive session

To run the interactive session scripts, first ensure you're on a compute node by running the following or an equivalent `srun` command (you'll need to update the QoS and account details):

```sh
srun --qos turing --account usjs9456-ati-test --time 1:00:00 --nodes 1 --gpus 1 --cpus-per-gpu 36 --mem 16384 --pty /bin/bash
```

## Queued jobs using sbatch
### Queued jobs using sbatch

All sbatch scripts have a QoS and account details set in them.
The parameters used for these will depend on your account and so should be adjusted accordingly.
Expand All @@ -21,7 +23,7 @@ The parameters used for these will depend on your account and so should be adjus
#SBATCH --account usjs9456-ati-test
```

## Baskerville training using FSDP
### Training using FSDP

The case of a single node can be run within an srun interactive session or scheduled using the sbatch scripts.

Expand Down Expand Up @@ -63,7 +65,7 @@ sbatch bask-train-fsdp-4x4.sh
This is set up to run on 2 nodes with 2 GPUs and to perform just a single run.
Edit the script header to test other combinations.

## Baskerville bandwidth
### Bandwidth

All bandwidth experiments should be run within an interactive session and from within the `aurora-hpc/train/batch` directory.

Expand All @@ -78,3 +80,72 @@ To run the GPU bandwidth experiments:
```sh
./bask-srun-gpubw.sh
```

## Dawn

The Dawn scripts are all done through batch jobs, no interactive session scripts.

All scripts should be run directly from the `aurora-hpc/train/batch` directory.


### Creating the virtual environment

Before performing any training the virtual environment should be created by running the following script:

```sh
cd aurora-hpc/train/batch
sbatch dawn-create-venv.sh
```

This will create a virtual environment in the `aurora-hpc/dawn/environments/venv_3_11_11` directory.

### Downloading the data

The data must also be downloaded before training can commence.
This also requires that you've created an account with the Climate Data Store and created a `.cdsapirc` file in your home directory with the following contents:

```sh
url: https://cds.climate.copernicus.eu/api
key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxxx
```

Here the `x` values must be replaced by your access key.
You can find out more about how to do this on the [Aurora ERA5 page](https://microsoft.github.io/aurora/example_era5.html).

Once you've set up your access key you can then download the data directly to Dawn using the following sbatch script.

```sh
cd aurora-hpc/dawn/batch
sbatch dawn-download-era5.sh
```

If successful this will result in the following files being downloaded to the `aurora-hpc/dawn/era5/era_v_inf` directory.

```
2023-01-atmospheric-36.nc
2023-01-surface-level-36.nc
static.nc
```

### Training

Once the virtual environment is set up, the training can be executed by queueing the appropriate script for the number of nodes and GPUs you want to use.

The following example is for one node and one GPU:

```sh
cd aurora-hpc/train/batch
sbatch dawn-train-ddp-1x1.sh
```

The other available configurations are the following:

```sh
dawn-train-ddp-1x1.sh # One node with one GPU (one GPU total)
dawn-train-ddp-1x4.sh # One node with four GPUs (four GPUs total)
dawn-train-ddp-2x4.sh # Two nodes with two GPUs each (four GPUs total)
dawn-train-ddp-2x8.sh # Two nodes with four GPUs each (eight GPUs total)
dawn-train-ddp-4x4.sh # Four nodes with one GPU each (four GPUs total)
dawn-train-ddp-4x8.sh # Four nodes with two GPUs each (eight GPUs total)
```

After each run the output logs will be sent to the `aurora-hpc/train/batch/results` directory.
80 changes: 80 additions & 0 deletions train/batch/dawn-create-venv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/bin/bash -l
#SBATCH --job-name venv
#SBATCH --output results/create-venv-%A.out
#SBATCH --account airr-p8-rcpp-dawn-gpu
#SBATCH --partition pvc9 # Dawn PVC partition
#SBATCH --cpus-per-task 24 # Number of cores per task
#SBATCH --nodes 1 # Number of nodes
#SBATCH --gpus-per-node 1 # Number of requested GPUs per node
#SBATCH --ntasks-per-node 1 # MPI ranks per node
#SBATCH --time 01:00:00

# Creates the Python 3.11 virtual environment used by the Dawn training
# and data-download jobs, then installs the project (with its "dawn"
# extras) plus the Intel XPU builds of PyTorch and its extensions.
#
# Execute using:
# sbatch ./dawn-create-venv.sh

echo
echo "## Aurora create virtual environment script starting"

# Quit on error
set -e

pushd ../scripts

echo
echo "## Loading modules"

module purge
module load default-dawn
module load lua
module load intel-oneapi-ccl/2021.14.0
module load intel-oneapi-mpi/2021.14.1
module load intel-oneapi-mkl/2025.0.1

echo
echo "## Configuring environment"

# Path is relative to ../scripts (the directory pushed onto above).
VENV_DIR=../../dawn/environments/venv_3_11_11

echo
echo "## Initialising virtual environment"

python3.11 -m venv "${VENV_DIR}"
. "${VENV_DIR}/bin/activate"

echo
echo "## Details"
echo
echo "Nodes: ${SLURM_JOB_NUM_NODES}"
echo "GPUs per node: ${SLURM_GPUS_PER_NODE}"
echo "Tasks per node: ${SLURM_NTASKS_PER_NODE}"
echo "CPUS per task: ${SLURM_CPUS_PER_TASK}"
echo "Working directory: $(realpath "${PWD}")"
echo "Location of venv: $(realpath "${VENV_DIR}")"

echo
echo "## Installing packages"

# Time the installs so elapsed wall-clock time can be reported below.
START=$(date +%s)
pip install --upgrade pip
# Quoted so the shell cannot glob-expand the ".[dawn]" extras specifier.
pip install -e "../../.[dawn]"
pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu
pip install --trusted-host pytorch-extension.intel.com intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
END=$(date +%s)
ELAPSED=$((END - START))

echo
echo "## Details post"
echo
echo "Time completed: $(date --iso-8601=ns)"
echo "Epoch start: ${START}"
echo "Epoch end: ${END}"
echo "Elapsed: ${ELAPSED} seconds"

echo
echo "## Tidying up"

deactivate
popd

echo
echo "## Aurora create virtual environment script completed"
Loading