
Commit 3ca9c84

[spark-rapids] generate actions from templates
1 parent aa792c3 commit 3ca9c84

4 files changed, +257 −92 lines changed

spark-rapids/test_mig.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import os
+import time
+
+import pkg_resources
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from integration_tests.dataproc_test_case import DataprocTestCase
+
+class MigTestCase(DataprocTestCase):
+  COMPONENT = "rapids"
+  INIT_ACTIONS = ["spark-rapids/mig.sh"]
+
+  GPU_H100 = "type=nvidia-h100-80gb,count=2"
+  default_machine_type = "n1-standard-32"
+
+  def verify_mig_instance(self, name):
+    self.assert_instance_command(name,
+        "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
+
+  @parameterized.parameters(
+      ("SINGLE", ["m"], GPU_H100, None, "NVIDIA", "us-central1-c"),
+      # ("STANDARD", ["m", "w-0", "w-1"], None, GPU_H100, "NVIDIA", "us-central1-c"),
+      # ("KERBEROS", ["m", "w-0", "w-1"], None, GPU_H100, "NVIDIA", "us-central1-c"),
+  )
+  def test_install_gpu_with_mig(self, configuration, machine_suffixes,
+                                master_accelerator, worker_accelerator,
+                                driver_provider, zone):
+
+    if configuration == 'SINGLE' and master_accelerator is None:
+      master_accelerator = self.GPU_H100
+
+    self.createCluster(
+        configuration,
+        self.INIT_ACTIONS,
+        zone=zone,
+        master_machine_type="a3-highgpu-2g" if master_accelerator == self.GPU_H100 else self.default_machine_type,
+        worker_machine_type="a3-highgpu-2g" if worker_accelerator == self.GPU_H100 else self.default_machine_type,
+        master_accelerator=master_accelerator,
+        worker_accelerator=worker_accelerator,
+        metadata=None,
+        timeout_in_minutes=30,
+        network_interface="nic-type=GVNIC,address=,network=default",
+        boot_disk_size="40GB",
+    )
+
+    for machine_suffix in machine_suffixes:
+      self.verify_mig_instance("{}-{}".format(self.getClusterName(),
+                                              machine_suffix))
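
The MIG check in verify_mig_instance is a single shell pipeline. As a sketch of what each stage does on the target node (the trailing echo is illustrative and not part of the test):

# Print each GPU's current MIG mode, one value per line, no CSV header.
# `uniq` collapses adjacent identical lines, so a homogeneous set of GPUs
# reduces to a single line.
# `xargs -I % test % = 'Enabled'` runs `test` against each remaining line;
# the pipeline exits non-zero unless every value equals 'Enabled'.
/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader \
  | uniq \
  | xargs -I % test % = 'Enabled'
echo "MIG enabled on all GPUs: $?"   # 0 means enabled everywhere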

spark-rapids/test_spark_rapids.py

Lines changed: 23 additions & 92 deletions
@@ -1,4 +1,5 @@
 import os
+import time
 
 import pkg_resources
 from absl.testing import absltest
@@ -11,31 +12,27 @@ class SparkRapidsTestCase(DataprocTestCase):
   INIT_ACTIONS = ["spark-rapids/spark-rapids.sh"]
 
   GPU_T4 = "type=nvidia-tesla-t4"
-  GPU_A100 = "type=nvidia-tesla-a100"
+  default_machine_type = "n1-highmem-8"
 
   # Tests for RAPIDS init action
   XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids.scala"
   XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids_sql.scala"
+  cmd_template = """echo :quit | spark-shell \
+      --conf spark.executor.resource.gpu.amount=1 \
+      --conf spark.task.resource.gpu.amount=1 \
+      --conf spark.dynamicAllocation.enabled=false -i {}"""
 
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
 
-  def verify_mig_instance(self, name):
-    self.assert_instance_command(name,
-        "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
-
   def verify_spark_job(self):
     instance_name = "{}-m".format(self.getClusterName())
     self.upload_test_file(
         os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME,
                             instance_name)
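
Factoring the spark-shell invocation into cmd_template replaces two near-identical inline strings. After .format() fills in the script name, the command that runs on the master node looks like this (shown with the non-SQL script; the SQL variant differs only in the file name):

# Run a Scala verification script in spark-shell with one GPU per executor
# and per task, dynamic allocation off; `echo :quit` ends the REPL session
# once the -i script has finished.
echo :quit | spark-shell \
    --conf spark.executor.resource.gpu.amount=1 \
    --conf spark.task.resource.gpu.amount=1 \
    --conf spark.dynamicAllocation.enabled=false \
    -i verify_xgboost_spark_rapids.scala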

@@ -46,24 +43,14 @@ def verify_spark_job_sql(self):
            os.path.dirname(os.path.abspath(__file__)),
            self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME,
                             instance_name)
 
   @parameterized.parameters(("SINGLE", ["m"], GPU_T4),
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -72,106 +59,50 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         self.INIT_ACTIONS,
         optional_components=optional_components,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type=self.default_machine_type,
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
       self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                 machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job()
-
-  @parameterized.parameters(("SINGLE", ["m"], GPU_T4),
-                            ("STANDARD", ["w-0"], GPU_T4))
-  def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
-    optional_components = None
-    metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
-
-    self.createCluster(
-        configuration,
-        self.INIT_ACTIONS,
-        optional_components=optional_components,
-        metadata=metadata,
-        machine_type="n1-standard-4",
-        master_accelerator=accelerator if configuration == "SINGLE" else None,
-        worker_accelerator=accelerator,
-        boot_disk_size="50GB",
-        timeout_in_minutes=30)
-
-    for machine_suffix in machine_suffixes:
-      self.verify_spark_instance("{}-{}".format(self.getClusterName(),
-                                                machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job_sql()
+    if self.getImageOs() == 'rocky' \
+        and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
+        and configuration == 'SINGLE':
+      print("skipping spark job test; 2.1-rocky8 and 2.0-rocky8 single-instance tests are known to fail")
+    else:
+      # Only need to do this once
+      self.verify_spark_job()
+      # Only need to do this once
+      self.verify_spark_job_sql()
 
   @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
     metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
                 ",cuda-version={0},driver-version={1}".format(cuda_version, driver_version))
 
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type="n1-standard-32",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
      self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                machine_suffix))
     # Only need to do this once
     self.verify_spark_job()
-
-    # Disable MIG related test due to the lack of A100 GPUs, more detail see
-    # https://github.com/GoogleCloudDataproc/initialization-actions/pull/1070
-
-    # @parameterized.parameters(("STANDARD", ["m", "w-0", "w-1"], None, GPU_A100, "NVIDIA", "us-central1-c"))
-    # def test_install_gpu_with_mig(self, configuration, machine_suffixes,
-    #                               master_accelerator, worker_accelerator,
-    #                               driver_provider, zone):
-    #   if self.getImageVersion() < pkg_resources.parse_version("2.0") or self.getImageOs() == "rocky":
-    #     self.skipTest("Not supported in pre 2.0 or Rocky images")
-    #
-    #   if self.getImageVersion() == pkg_resources.parse_version("2.1"):
-    #     self.skipTest("Not supported in 2.1 images")
-    #
-    #   self.createCluster(
-    #       configuration,
-    #       self.INIT_ACTIONS,
-    #       zone=zone,
-    #       master_machine_type="n1-standard-4",
-    #       worker_machine_type="a2-highgpu-1g",
-    #       master_accelerator=master_accelerator,
-    #       worker_accelerator=worker_accelerator,
-    #       metadata=None,
-    #       timeout_in_minutes=30,
-    #       boot_disk_size="200GB",
-    #       startup_script="spark-rapids/mig.sh")
-    #
-    #   for machine_suffix in ["w-0", "w-1"]:
-    #     self.verify_mig_instance("{}-{}".format(self.getClusterName(),
-    #                                             machine_suffix))
+    # Only need to do this once
+    self.verify_spark_job_sql()
 
 if __name__ == "__main__":
   absltest.main()
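
For the parameter set above ("12.4.0", "550.54.14"), the metadata string built by test_non_default_cuda_versions resolves to the line below; presumably the harness forwards it to cluster creation as a --metadata flag (shown here as a sketch, not captured output):

--metadata=gpu-driver-provider=NVIDIA,rapids-runtime=SPARK,cuda-version=12.4.0,driver-version=550.54.14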

templates/spark-rapids/mig.sh.in

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+[% INSERT legal/license_header %]
+#
+# This script installs NVIDIA GPU drivers and enables MIG on Hopper
+# GPU architectures.
+#
+# The script should be specified in the --initialization-actions=
+# option, and --metadata=ENABLE_MIG can be used to enable or disable
+# MIG; the default is to enable it. The script configures the MIG
+# devices based on the MIG_CGI profiles specified via, for example,
+# --metadata=^:^MIG_CGI='9,9'. If MIG_CGI is not specified, the script
+# assumes an H100 and configures two instances with profile id 9.
+#
+[% PROCESS common/template_disclaimer %]
+[% INSERT common/util_functions %]
+[% INSERT common/install_functions %]
+[% INSERT common/yarn_functions %]
+[% INSERT gpu/mig_functions %]
+[% INSERT gpu/util_functions %]
+[% INSERT gpu/install_functions %]
+[% INCLUDE gpu/yarn_functions %]
+[% INSERT gpu/spark_functions %]
+
+set -euxo pipefail
+
+function main() {
+  if [[ "${nvsmi_works}" == "1" ]] ; then
+    # nvidia-smi only works once the drivers are installed; if it
+    # works, query the current MIG mode of each GPU.
+    query_nvsmi
+    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+    set +e
+    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
+    set -e
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+  fi
+
+  # If MIG is already enabled, the drivers have already been installed
+  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+    install_nvidia_gpu_driver
+    install_cuda
+    load_kernel_module
+
+    # Install GPU metrics collection in Stackdriver if needed
+    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+      install_gpu_agent
+      # install_gpu_monitoring_agent
+      echo 'GPU metrics agent successfully deployed.'
+    else
+      echo 'GPU metrics agent has not been installed.'
+    fi
+    configure_gpu_exclusive_mode
+  fi
+
+  setup_gpu_yarn
+
+  echo "yarn setup complete"
+
+  enable_and_configure_mig
+
+  echo "main complete"
+  return 0
+}
+
+function exit_handler() {
+  set +e
+  gpu_install_exit_handler
+  gpu_exit_handler
+  pip_exit_handler
+  yarn_exit_handler
+  common_exit_handler
+  return 0
+}
+
+function prepare_to_install() {
+  prepare_spark_env
+  prepare_common_env
+  prepare_pip_env
+  prepare_gpu_env
+  prepare_gpu_install_env
+  trap exit_handler EXIT
+}
+
+prepare_to_install
+
+main
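
The helper functions (query_nvsmi, fetch_mig_scripts, setup_gpu_yarn, and the rest) come from the [% INSERT %] template fragments and are not part of this commit. As a sketch, assuming query_nvsmi captures the XML dump of nvidia-smi into ${nvsmi_query_xml}, the MIG-mode query in main() is roughly equivalent to:

# Assumption: query_nvsmi writes `nvidia-smi -q -x` output to this file.
nvsmi_query_xml=/tmp/nvsmi_query.xml
/usr/bin/nvidia-smi -q -x > "${nvsmi_query_xml}"

# Extract each GPU's current MIG mode; GPUs without MIG support report
# N/A, which the script filters out before counting distinct modes.
xmllint --xpath '//nvidia_smi_log/*/mig_mode/current_mig/text()' \
  "${nvsmi_query_xml}" | grep -v 'N/A'

And as a usage sketch only, a cluster that runs the generated mig.sh with a custom MIG profile might be created along these lines; the cluster name, region, and GCS path are placeholders, not values from this commit:

# Hypothetical invocation; adjust names, region, and bucket.
gcloud dataproc clusters create example-mig-cluster \
  --region=us-central1 \
  --zone=us-central1-c \
  --master-machine-type=a3-highgpu-2g \
  --master-accelerator=type=nvidia-h100-80gb,count=2 \
  --initialization-actions=gs://example-bucket/spark-rapids/mig.sh \
  --metadata=^:^MIG_CGI='9,9'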
