Updates the scripts for queuing using sbatch, or for running within an interactive shell on a compute node using srun.

The objective of these changes is to ensure:
1. Everything works simply by running the scripts.
2. There's consistency across the various scripts.

The minimum xarray version on Baskerville has been increased to 2023.06.0 in order to avoid Issue #7880, which I experienced consistently when executing runmodel.py. See here for the relevant xarray issue and changelog:
pydata/xarray#7880
https://github.com/pydata/xarray/releases/tag/v2023.06.0
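As a quick sanity check before running the model, the installed xarray release can be compared against this minimum. This is just an illustrative sketch (the helper names are invented; only the `2023.06.0` minimum comes from the text above):

```python
from importlib.metadata import version

# Minimum xarray release that avoids pydata/xarray#7880
MIN_XARRAY = (2023, 6, 0)

def parse_version(text):
    """Turn a release string like '2023.06.0' into a comparable tuple."""
    return tuple(int(part) for part in text.split(".")[:3])

def xarray_is_recent_enough():
    """True when the installed xarray meets the Baskerville minimum."""
    return parse_version(version("xarray")) >= MIN_XARRAY
```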
Then source the following file to set up the environment:

```bash
. ./batch-srun.sh
```

Finally run the graph generation script.

The value 4 passed in as the `-n` parameter is the number of `preds` files to use. In general this should be left as four to match the files generated as explained above.
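For illustration, an `-n` option of this shape could be defined with argparse as below. This is a hypothetical sketch, not the actual implementation of the graph generation script:

```python
import argparse

def build_parser():
    """Hypothetical CLI mirroring the -n option described above."""
    parser = argparse.ArgumentParser(description="Generate comparison graphs")
    parser.add_argument(
        "-n",
        type=int,
        default=4,
        help="number of preds files to use (four matches the files generated above)",
    )
    return parser
```

With this parser, running the script with no arguments falls back to the default of four, matching the recommendation above.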
This will set up modules, environment and virtual environment. You can then run scripts directly, for example:

```bash
python download.py
```

## Download the data

This will download the data to the `aurora-hpc/downloads` directory.

```bash
sbatch batch-download.sh
```

## Perform the prediction

```bash
sbatch batch-runmodel.sh
```

## Display the resulting image

Assuming you have X-forwarding enabled on your Baskerville session, you can display the resulting image on your local machine by running the following.

```bash
module load ImageMagick/7.1.0-37-GCCcore-11.3.0
magick display plots.pdf
```
For fine-tuning, the same data download can be used. You can then immediately perform fine-tuning with the small (debug) model on a 40 GiB A100 with the following.

```bash
sbatch batch-finetune-small.sh
```

## Fine-tuning the standard model

There are four versions of the fine-tuning process for the standard model: DDP, FSDP, Aligned and a preliminary version. The last of these is of historical interest and shows the development of the process, but won't run on a Baskerville A100 with 80 GiB of memory due to out-of-memory errors. This preliminary version uses a simplified loss function rather than the loss function specified in the paper, which is likely to be the source of these errors.
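To make the distinction concrete, the difference between a simplified loss and a per-variable weighted loss can be sketched as follows. This is purely illustrative: the variable names and weights are invented, and neither function reproduces the repository's actual training code or the paper's exact objective:

```python
import numpy as np

def simplified_loss(pred, target):
    """Plain mean absolute error over every output value."""
    return np.mean(np.abs(pred - target))

def weighted_loss(preds, targets, weights):
    """Per-variable weighted mean absolute error.

    preds/targets: dict mapping variable name -> array
    weights: dict mapping variable name -> float (hypothetical values)
    """
    total = 0.0
    for name, weight in weights.items():
        total += weight * np.mean(np.abs(preds[name] - targets[name]))
    return total / sum(weights.values())
```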

To test out the different versions the following commands can be used:

```bash
sbatch batch-finetune-ddp.sh
sbatch batch-finetune-fsdp.sh
sbatch batch-finetune-aligned.sh
```
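The three submissions above can equally be scripted in a loop, as in this convenience sketch. The `finetune_script` helper is hypothetical; `sbatch` is the Slurm submission command already used above, and the guard simply skips submission on machines where Slurm is not installed:

```shell
# Map a variant name onto its batch script (hypothetical helper)
finetune_script() {
  printf 'batch-finetune-%s.sh' "$1"
}

# Submit each fine-tuning variant in turn
for variant in ddp fsdp aligned; do
  script="$(finetune_script "$variant")"
  # Only submit when sbatch is actually available (e.g. on Baskerville)
  if command -v sbatch >/dev/null 2>&1; then
    sbatch "$script"
  else
    echo "would submit: $script"
  fi
done
```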