
Commit c56efe7

Author: Riyaz Haque (committed)
Merge remote-tracking branch 'origin/develop' into features/branson
2 parents: 7c02c66 + 15c1000

File tree: 7 files changed (+132, -55 lines)


.gitlab-ci.yml

Lines changed: 2 additions & 0 deletions

@@ -47,6 +47,8 @@ variables:
   FLUX_ARGS: -N 1 --exclusive
   # Dane
   DANE_SHARED_ALLOC: -N 1 -t 04:00:00 --exclusive --reservation=ci
+  # Matrix (NODES ARE SHARED WITHOUT --exclusive)
+  MATRIX_SHARED_ALLOC: -N 1 -t 04:00:00 --exclusive
   # Tuolumne (max 4hr policy)
   TUOLUMNE_SHARED_ALLOC: -N 1 -t 2h --queue=pbatch --exclusive --setattr=gpumode=${GPUMODE}
     --conf=resource.rediscover=true
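
For context, a minimal sketch of how the new MATRIX_SHARED_ALLOC value is consumed: the shared allocation template in .gitlab/tests/shared_slurm_clusters.yml (further below) passes it through ${SHARED_ALLOC}, so on matrix the submit step effectively expands to something like the following (ALLOC_NAME is assumed to be exported by the surrounding CI job):

  # Hypothetical expansion of the shared allocation submit on matrix
  JOBID=$(sbatch --parsable -N 1 -t 04:00:00 --exclusive \
          --wait-all-nodes=1 --job-name="${ALLOC_NAME}" \
          --wrap='srun -N 1 -n 1 flux start sleep inf')
  echo "Slurm job = ${JOBID}"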

.gitlab/tests/nightly.yml

Lines changed: 17 additions & 0 deletions

@@ -5,6 +5,7 @@ include:
 variables:
   DASHBOARD_NAME: Nightly [benchpark-develop]
   DANE_PARAMS: -N 1 -t 01:00:00 --exclusive --reservation=ci
+  MATRIX_PARAMS: -N 1 -t 01:00:00 --exclusive
   ELCAP_PARAMS: -N 1 -t 60m --queue=pci --exclusive
 parallel:
   matrix:
@@ -60,6 +61,22 @@ include:
       - raja-perf
       - salmon-tddft
       VARIANT: [+openmp]
+    # +cuda tests
+    - HOST: matrix
+      ARCHCONFIG: llnl-matrix
+      SCHEDULER_PARAMETERS: $MATRIX_PARAMS
+      BENCHMARK:
+      - amg2023
+      - babelstream
+      - gromacs
+      - kripke
+      - laghos
+      - lammps
+      - osu-micro-benchmarks
+      - raja-perf
+      - remhos
+      - saxpy
+      VARIANT: [+cuda]
 #############
 # Other tests
 #############
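
To make the new matrix entries concrete, here is a rough sketch of the commands that .gitlab/utils/run-experiment.sh would issue for one of them (HOST=matrix, ARCHCONFIG=llnl-matrix, BENCHMARK=amg2023, VARIANT=+cuda). This is illustrative only: $SYSTEM_ARGS is omitted, any variant post-processing in the script is skipped, and the cluster= argument is dropped for matrix per the change further below.

  # Illustrative expansion; exact arguments come from the CI variables.
  ./bin/benchpark system init --dest=matrix llnl-matrix
  ./bin/benchpark experiment init --dest=amg2023 --system=matrix amg2023 +cuda
  ./bin/benchpark setup amg2023 wkp/
  . wkp/setup.sh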

.gitlab/tests/shared_slurm_clusters.yml

Lines changed: 76 additions & 33 deletions

@@ -9,35 +9,83 @@ workflow:
 #############################
 # Resource Allocation Section
 #############################
-allocate_resources_dane:
-  tags: [shell, dane]
-  needs: [dane-up-check]
+.allocate_resources_slurm:
   extends: .on_host_template
   stage: allocate-resources
   script:
-    - sbatch ${DANE_SHARED_ALLOC} --wait-all-nodes=1 --job-name=${ALLOC_NAME} --wrap="srun
-      -N 1 -n 1 flux start sleep inf"
+    - set -e
+    # submit and grab JobID
+    - JOBID=$(sbatch --parsable ${SHARED_ALLOC} --wait-all-nodes=1 --job-name="${ALLOC_NAME}"
+      --wrap='srun -N 1 -n 1 flux start sleep inf')
+    - echo "Slurm job = ${JOBID}"
+    # wait until SLURM job running
+    # 1..2160 is up to 6 hours
+    - for i in {1..2160}; do state=$(sacct -j ${JOBID} -n -o State 2>/dev/null | head
+      -1 | awk '{print $1}'); [[ "$state" == "RUNNING" ]] && break; echo "waiting...
+      total duration has been $(( (i-1)*10 ))s" && sleep 10; done
+    #- for i in {1..1080}; do flux proxy slurm:${JOBID} flux ping -c1 >/dev/null 2>&1 && break; echo "waiting..." sleep 10; done
+    #- flux proxy slurm:${JOBID} flux ping -c1 # final check (fails the job if still not ready)
   after_script:
-    - |-
+    - |
       if [[ "$CI_JOB_STATUS" == "canceled" ]]; then
         . .gitlab/utils/cancel-slurm.sh
       fi
+allocate_resources_dane:
+  extends: .allocate_resources_slurm
+  variables:
+    HOST: dane
+    SHARED_ALLOC: $DANE_SHARED_ALLOC
+  tags: [shell, dane]
+  needs: [dane-up-check]
+allocate_resources_matrix:
+  extends: .allocate_resources_slurm
+  variables:
+    HOST: matrix
+    SHARED_ALLOC: $MATRIX_SHARED_ALLOC
+  tags: [shell, matrix]
+  needs: [matrix-up-check]
 ##########################
 # Resource Release Section
 ##########################
-release_resources_dane:
-  tags: [shell, dane]
-  needs: [allocate_resources_dane, run_tests_slurm_dane]
+.release_resources_slurm:
   extends: .on_host_template
   stage: release-resources
   # Do not cleanup dir yet
-  script:
-    - . .gitlab/utils/cancel-slurm.sh --no-clean
+  script: [. .gitlab/utils/cancel-slurm.sh --no-clean]
+release_resources_dane:
+  extends: .release_resources_slurm
+  tags: [shell, dane]
+  needs: [allocate_resources_dane, run_tests_slurm_dane]
+release_resources_matrix:
+  extends: .release_resources_slurm
+  tags: [shell, matrix]
+  needs: [allocate_resources_matrix, run_tests_slurm_matrix]
 ##################
 # Test Run Section
 ##################
-.run_tests_slurm: &run_tests_slurm
+.run_tests_slurm:
   extends: .on_host_template
+  stage: test
+  before_script:
+    - !reference [.report_status, before_script]
+  script:
+    - |
+      echo -e "### Allocation name is ${ALLOC_NAME}"
+      export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A)
+      echo -e "### Job ID is ${JOBID}"
+    - |
+      PROXY="$( [[ -n "${JOBID}" ]] && echo "flux proxy slurm:${JOBID}" || echo "" )"
+      ${PROXY} flux watch $( ${PROXY} flux batch -o output.stdout.type=kvs ${FLUX_ARGS} .gitlab/utils/run-experiment.sh )
+  after_script:
+    - !reference [.report_status, after_script]
+    - |
+      if [[ "$CI_JOB_STATUS" == "canceled" ]]; then
+        . .gitlab/utils/cancel-slurm.sh
+      fi
+run_tests_slurm_dane:
+  extends: .run_tests_slurm
+  tags: [shell, dane]
+  needs: [allocate_resources_dane]
   parallel:
     matrix:
     - HOST: dane
@@ -58,24 +106,19 @@ release_resources_dane:
       ARCHCONFIG: llnl-cluster
       BENCHMARK: [kripke]
       VARIANT: ['caliper=mpi,time hwloc=on affinity=on']
-run_tests_slurm_dane:
-  tags: [shell, dane]
-  needs: [allocate_resources_dane]
-  stage: test
-  <<: *run_tests_slurm
-  before_script:
-    - !reference [.report_status, before_script]
-  script:
-    - |
-      echo -e "### Allocation name is ${ALLOC_NAME}"
-      export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A)
-      echo -e "### Job ID is ${JOBID}"
-    - |
-      PROXY="$( [[ -n "${JOBID}" ]] && echo "flux proxy slurm:${JOBID}" || echo "" )"
-      ${PROXY} flux watch $( ${PROXY} flux batch -o output.stdout.type=kvs ${FLUX_ARGS} .gitlab/utils/run-experiment.sh )
-  after_script:
-    - !reference [.report_status, after_script]
-    - |-
-      if [[ "$CI_JOB_STATUS" == "canceled" ]]; then
-        . .gitlab/utils/cancel-slurm.sh
-      fi
+run_tests_slurm_matrix:
+  extends: .run_tests_slurm
+  tags: [shell, matrix]
+  needs: [allocate_resources_matrix]
+  parallel:
+    matrix:
+    - HOST: matrix
+      ARCHCONFIG: llnl-matrix
+      BENCHMARK:
+      - amg2023
+      - babelstream
+      - kripke
+      - laghos
+      - raja-perf
+      - saxpy
+      VARIANT: [+cuda]
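
Because the wait loop in .allocate_resources_slurm is folded across YAML continuation lines, here is the same logic spread out for readability (behavior unchanged: poll sacct every 10 seconds, up to the 6-hour ceiling of 2160 iterations noted in the comment):

  # Wait until the Slurm allocation reports RUNNING.
  for i in {1..2160}; do
      state=$(sacct -j "${JOBID}" -n -o State 2>/dev/null | head -1 | awk '{print $1}')
      [[ "$state" == "RUNNING" ]] && break
      echo "waiting... total duration has been $(( (i-1)*10 ))s"
      sleep 10
  done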

.gitlab/utils/machine_checks.yml

Lines changed: 5 additions & 0 deletions

@@ -28,3 +28,8 @@ dane-up-check:
   variables:
     CI_MACHINE: dane
   extends: [.machine-check]
+matrix-up-check:
+  stage: machine-checks
+  variables:
+    CI_MACHINE: matrix
+  extends: [.machine-check]

.gitlab/utils/run-experiment.sh

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,7 @@ set -e
 # Activate Virtual Environment
 . /usr/workspace/benchpark-dev/benchpark-venv/$SYS_TYPE/bin/activate

-echo "./bin/benchpark system init --dest=${HOST} ${ARCHCONFIG} cluster=$HOST $SYSTEM_ARGS"
+echo "./bin/benchpark system init --dest=${HOST} ${ARCHCONFIG} $([ "$HOST" != "matrix" ] && echo "cluster=$HOST") $SYSTEM_ARGS"
 echo "./bin/benchpark experiment init --dest=${BENCHMARK} --system=${HOST} ${BENCHMARK} ${VARIANT}"
 echo "./bin/benchpark setup ${BENCHMARK} wkp/"
 echo ". wkp/setup.sh"
@@ -14,7 +14,7 @@ echo "ramble --disable-logger --workspace-dir . on --executor '{execute_experime
 echo "ramble --disable-logger --workspace-dir . workspace analyze --format json yaml text"

 # Initialize System
-./bin/benchpark system init --dest=${HOST} ${ARCHCONFIG} cluster=$HOST $SYSTEM_ARGS
+./bin/benchpark system init --dest=${HOST} ${ARCHCONFIG} $([ "$HOST" != "matrix" ] && echo "cluster=$HOST") $SYSTEM_ARGS

 # Initialize Experiment
 BV=""
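
The new command substitution drops the cluster= argument only when the host is matrix. A quick sketch of how it evaluates for hypothetical HOST values:

  HOST=dane
  echo "args: $([ "$HOST" != "matrix" ] && echo "cluster=$HOST")"
  # -> args: cluster=dane
  HOST=matrix
  echo "args: $([ "$HOST" != "matrix" ] && echo "cluster=$HOST")"
  # -> args:   (empty; the llnl-matrix system apparently takes no cluster= argument)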

docs/add-a-system-config.rst

Lines changed: 25 additions & 16 deletions

@@ -237,7 +237,7 @@ this function, followed by our AWS example:
    ``compiler_section_for()``.
 3. Merge the compiler definitions with merge_dicts (this part is unnecessary if you have
    only one type of compiler).
-4. Generally, you will want to compose a minimal list of compilers: e.g. if you want to
+4. Generally, you will want to compose a minimal list of compilers: e.g., if you want to
    compile your benchmark with the oneAPI compiler, and have multiple versions to choose
    from, you would add a variant to the system, and the config would expose only one of
    them.
@@ -247,9 +247,9 @@ For our AWS system, the compiler we define is ``[email protected]``. For the

 1. ``spec`` - Similar to package specs, ``name@version``. GCC in particular also needs
    the ``languages`` variant, where the list of languages depends on the available
-   ``exes`` (e.g. do not include "fortran" if ``gfortran`` is not available). If you are
-   **not** using GCC or Spack as your package manager, ``languages`` is unnecessary.
-2. ``prefix`` - Prefix to the compiler binary directory, e.g. ``/usr/`` for
+   ``exes`` (e.g., do not include "fortran" if ``gfortran`` is not available). If you
+   are **not** using GCC or Spack as your package manager, ``languages`` is unnecessary.
+2. ``prefix`` - Prefix to the compiler binary directory, e.g., ``/usr/`` for
    ``/usr/bin/gcc``
 3. ``exes`` - Dictionary to map ``c``, ``cxx``, and ``fortran`` to the appropriate file
    found in the prefix.
@@ -320,23 +320,27 @@ instead. For each package that you include, you need to define its spec ``name@v
 and the system path ``prefix`` to the package. Additionally for Spack, you need to set
 ``buildable: False`` to tell Spack not to build that package.

-At minimum, we recommend you define externals for ``cmake`` and ``mpi`` (users also
-typically define externals for other libraries, e.g. math libraries like ``blas`` and
-``lapack``). This is because certain packages (e.g. ``cmake``) can take a long time to
+At minimum, we recommend to define externals for ``cmake`` and ``mpi`` (users also
+typically define externals for other libraries, e.g., math libraries like ``blas`` and
+``lapack``). This is because certain packages (e.g., ``cmake``) can take a long time to
 build, and packages such as ``mpi``, ``blas``, and ``lapack`` can influence runtime
-performance significantly. Additionally, for systems with accelerators, define externals
-for CUDA and ROCm runtime libraries (see externals examples for a `CUDA system
+performance significantly so it is prudent to use the versions optimized for our system.
+Additionally, for systems with accelerators, define externals for CUDA and ROCm runtime
+libraries (see externals examples for a `CUDA system
 <https://github.com/LLNL/benchpark/blob/e82e3a26aef54855cf281c088b8f149ab7d87d9d/systems/llnl-matrix/system.py#L274>`_,
 or a `ROCm system
 <https://github.com/LLNL/benchpark/blob/e82e3a26aef54855cf281c088b8f149ab7d87d9d/systems/llnl-elcapitan/system.py#L483>`_).
-Also, see :ref:`adding-sys-packages`, for help on how to search for the packages
-available on your system.
+See :ref:`adding-sys-packages`, for help on how to search for the packages available on
+your system.

 .. note::

-   For ``mpi``, you need to define ``"mpi": {"buildable": False},`` as a virtual
-   package, and then define your MPI package as we have for the ``openmpi`` package.
-   This is to ensure Spack uses our MPI, and does not try to build another MPI package.
+   For packages that declare virtual dependencies, e.g., ``depends_on("mpi")``, you
+   need to define a virtual package ``"mpi": {"buildable": False},``, followed by a
+   definition of at least one provider of this package (see the provider definition for
+   ``openmpi`` in our example). This is to ensure Spack uses the provider we specified,
+   and does not try to build another MPI package. See a similar example for ``blas``,
+   ``lapack``, and their provider ``atlas``.

 ::

@@ -354,6 +358,11 @@ available on your system.
     def compute_packages_section(self):
         return {
             "packages": {
+                "blas": {"buildable": False},
+                "lapack": {"buildable": False},
+                "atlas": {
+                    "externals": [{"spec": "[email protected]", "prefix": "/usr"}],
+                },
                 "mpi": {"buildable": False},
                 "openmpi": {
                     "externals": [
@@ -364,7 +373,7 @@ available on your system.
                     ]
                 },
                 "cmake": {
-                    "externals": [{"spec": "cmake@4.0.2", "prefix": "/usr"}],
+                    "externals": [{"spec": "cmake@4.1.1", "prefix": "/usr"}],
                     "buildable": False,
                 },
                 ...
@@ -462,7 +471,7 @@ cluster=<cluster>``:
     'tar': {'buildable': False,
             'externals': [{'prefix': '/usr', 'spec': '[email protected]'}]}}

-where the command should be ran on a cluster that is defined for the given system, e.g.
+where the command should be ran on a cluster that is defined for the given system, e.g.,
 ruby for llnl-cluster. Use this output to update your package definitions in your
 ``system.py``'s ``compute_package_section()``.
systems/aws-tutorial/system.py

Lines changed: 5 additions & 4 deletions

@@ -75,9 +75,10 @@ def compute_packages_section(self):
                 "buildable": False,
             },
             "gmake": {"externals": [{"spec": "[email protected]", "prefix": "/usr"}]},
-            "lapack": {
-                "externals": [{"spec": "[email protected]", "prefix": "/usr"}],
-                "buildable": False,
+            "blas": {"buildable": False},
+            "lapack": {"buildable": False},
+            "atlas": {
+                "externals": [{"spec": "[email protected]", "prefix": "/usr"}],
             },
             "mpi": {"buildable": False},
             "openmpi": {
@@ -89,7 +90,7 @@ def compute_packages_section(self):
                 ]
             },
             "cmake": {
-                "externals": [{"spec": "cmake@4.0.2", "prefix": "/usr"}],
+                "externals": [{"spec": "cmake@4.1.1", "prefix": "/usr"}],
                 "buildable": False,
             },
             "git": {