Commit 1d790be

[spark-rapids] generate actions from templates

1 parent aa792c3 commit 1d790be

File tree

3 files changed: +208 -45 lines changed

spark-rapids/test_spark_rapids.py

Lines changed: 22 additions & 45 deletions
@@ -12,10 +12,15 @@ class SparkRapidsTestCase(DataprocTestCase):
 
   GPU_T4 = "type=nvidia-tesla-t4"
   GPU_A100 = "type=nvidia-tesla-a100"
+  default_machine_type = "n1-highmem-8"
 
   # Tests for RAPIDS init action
   XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids.scala"
   XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids_sql.scala"
+  cmd_template = """echo :quit | spark-shell \
+      --conf spark.executor.resource.gpu.amount=1 \
+      --conf spark.task.resource.gpu.amount=1 \
+      --conf spark.dynamicAllocation.enabled=false -i {}"""
 
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
@@ -31,11 +36,7 @@ def verify_spark_job(self):
             os.path.dirname(os.path.abspath(__file__)),
             self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME,
                             instance_name)
 
@@ -46,11 +47,7 @@ def verify_spark_job_sql(self):
             os.path.dirname(os.path.abspath(__file__)),
             self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME,
                             instance_name)
 
@@ -72,47 +69,25 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         self.INIT_ACTIONS,
         optional_components=optional_components,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type=self.default_machine_type,
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
       self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                 machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job()
 
-  @parameterized.parameters(("SINGLE", ["m"], GPU_T4),
-                            ("STANDARD", ["w-0"], GPU_T4))
-  def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
-
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
-    optional_components = None
-    metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
-
-    self.createCluster(
-        configuration,
-        self.INIT_ACTIONS,
-        optional_components=optional_components,
-        metadata=metadata,
-        machine_type="n1-standard-4",
-        master_accelerator=accelerator if configuration == "SINGLE" else None,
-        worker_accelerator=accelerator,
-        boot_disk_size="50GB",
-        timeout_in_minutes=30)
-
-    for machine_suffix in machine_suffixes:
-      self.verify_spark_instance("{}-{}".format(self.getClusterName(),
-                                                machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job_sql()
+    if self.getImageOs() == 'rocky' \
+        and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
+        and configuration == 'SINGLE':
+      print("skipping spark job test; single-instance tests on 2.0-rocky8 and 2.1-rocky8 are known to fail")
+    else:
+      # Only need to do this once
+      self.verify_spark_job()
+      # Only need to do this once
+      self.verify_spark_job_sql()
 
   @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
@@ -131,17 +106,19 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type="n1-standard-32",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
        timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
       self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                 machine_suffix))
     # Only need to do this once
     self.verify_spark_job()
+    # Only need to do this once
+    self.verify_spark_job_sql()
 
   # Disable MIG related test due to the lack of A100 GPUs, more detail see
   # https://github.com/GoogleCloudDataproc/initialization-actions/pull/1070
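
For reference, the consolidated cmd_template renders to an ordinary spark-shell invocation. A minimal sketch of running the same smoke test by hand on a cluster VM, assuming the verification script has already been copied there (the test copies it to the instance before running the command, and removes it afterwards):

  # Sketch: the manual equivalent of cmd_template.format(...), assuming
  # verify_xgboost_spark_rapids.scala is already present on the VM.
  echo :quit | spark-shell \
    --conf spark.executor.resource.gpu.amount=1 \
    --conf spark.task.resource.gpu.amount=1 \
    --conf spark.dynamicAllocation.enabled=false \
    -i verify_xgboost_spark_rapids.scala

The echo :quit exits the shell once the -i script has run; the two gpu.amount settings request one GPU per executor and per task, and dynamic allocation is disabled so executors hold their GPUs for the life of the job.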

templates/spark-rapids/mig.sh.in

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+[% INSERT legal/license_header %]
+#
+# This script installs NVIDIA GPU drivers and enables MIG on the Hopper
+# GPU architecture.
+#
+# The script should be specified in the --initialization-actions= option,
+# and --metadata=ENABLE_MIG can be used to enable or disable MIG. The
+# default is to enable it. The script configures the MIG devices based
+# on user-specified MIG_CGI profiles passed via
+# --metadata=^:^MIG_CGI='9,9'. If MIG_CGI is not specified, the script
+# assumes an H100 and configures 2 instances with profile id 9.
+#
+[% PROCESS common/template_disclaimer %]
+[% INSERT common/util_functions %]
+[% INSERT common/install_functions %]
+[% INSERT common/yarn_functions %]
+[% INSERT gpu/mig_functions %]
+[% INSERT gpu/util_functions %]
+[% INSERT gpu/install_functions %]
+[% INCLUDE gpu/yarn_functions %]
+[% INSERT gpu/spark_functions %]
+
+set -euxo pipefail
+
+function main() {
+  if [[ "${nvsmi_works}" == "1" ]] ; then
+    # if this is called without the MIG script, the drivers are not installed
+    query_nvsmi
+    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+    set +e
+    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
+    set -e
+    NUM_MIG_GPUS="$(echo "${migquery_result}" | uniq | wc -l)"
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+  fi
+
+  # if MIG is enabled, the drivers have already been installed
+  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+    install_nvidia_gpu_driver
+    install_cuda
+    load_kernel_module
+
+    # Install GPU metrics collection in Stackdriver if needed
+    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+      install_gpu_agent
+      # install_gpu_monitoring_agent
+      echo 'GPU metrics agent successfully deployed.'
+    else
+      echo 'GPU metrics agent has not been installed.'
+    fi
+    configure_gpu_exclusive_mode
+  fi
+
+  setup_gpu_yarn
+
+  echo "yarn setup complete"
+
+  enable_and_configure_mig
+
+  echo "main complete"
+  return 0
+}
+
+function exit_handler() {
+  set +e
+  gpu_install_exit_handler
+  gpu_exit_handler
+  pip_exit_handler
+  yarn_exit_handler
+  common_exit_handler
+  return 0
+}
+
+function prepare_to_install() {
+  prepare_spark_env
+  prepare_common_env
+  prepare_pip_env
+  prepare_gpu_env
+  prepare_gpu_install_env
+  trap exit_handler EXIT
+}
+
+prepare_to_install
+
+main
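
As a usage sketch only (the commit adds the template, not an invocation example): a cluster might attach the rendered mig.sh roughly as below. The gs:// path, region, machine type, and accelerator flags are illustrative assumptions; the ^:^ delimiter syntax and the ENABLE_MIG/MIG_CGI metadata keys come from the header comment above.

  # Hypothetical invocation; bucket, region, and machine/accelerator choices
  # are placeholders, not part of the commit.
  gcloud dataproc clusters create example-mig-cluster \
    --region=us-central1 \
    --initialization-actions=gs://YOUR_BUCKET/spark-rapids/mig.sh \
    --metadata=^:^ENABLE_MIG=1:MIG_CGI=9,9 \
    --worker-machine-type=a3-highgpu-8g \
    --worker-accelerator=type=nvidia-h100-80gb,count=8

The ^:^ prefix switches gcloud's list delimiter to ':' so the comma inside the MIG_CGI value survives; when MIG_CGI is omitted, the script defaults to two profile-9 instances on an H100.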

templates/spark-rapids/spark-rapids.sh.in

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+[% INSERT legal/license_header %]
+#
+#
+# This script installs NVIDIA GPU drivers.
+#
+# Dataproc 2.0: Driver version 530.30.02, CUDA version 12.1.1, Rapids 23.08.2
+# Dataproc 2.1: Driver version 550.135,   CUDA version 12.4.1, Rapids 24.08.1
+# Dataproc 2.2: Driver version 560.35.03, CUDA version 12.6.2, Rapids 24.08.1
+#
+# Additionally, it installs the RAPIDS Spark plugin, configures Spark
+# and YARN, and installs an agent to collect GPU utilization metrics.
+# The installer is regularly exercised with Debian, Ubuntu, and Rocky
+# Linux distributions.
+#
+# Note that the script is designed to work both when secure boot is
+# enabled with a custom image and when disabled during cluster
+# creation.
+#
+# For details, see
+# github.com/GoogleCloudDataproc/custom-images/tree/main/examples/secure-boot
+#
+[% PROCESS common/template_disclaimer %]
+[% INSERT common/util_functions %]
+[% INSERT common/install_functions %]
+[% INSERT common/yarn_functions %]
+[% INSERT gpu/util_functions %]
+[% INSERT gpu/install_functions %]
+[% INCLUDE gpu/yarn_functions %]
+[% INSERT gpu/spark_functions %]
+
+set -euxo pipefail
+
+function main() {
+  install_gpu_driver_and_cuda
+
+  # Install GPU metrics collection in Stackdriver if needed
+  if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+    # install_gpu_agent
+    install_gpu_monitoring_agent
+    echo 'GPU metrics agent successfully deployed.'
+  else
+    echo 'GPU metrics agent has not been installed.'
+  fi
+  configure_gpu_exclusive_mode
+
+  setup_gpu_yarn
+
+  echo "yarn setup complete"
+
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+    install_spark_rapids
+    echo "RAPIDS initialized with Spark runtime"
+  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
+    echo "This action only installs spark-rapids"
+    exit 1
+  else
+    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
+    exit 1
+  fi
+
+  echo "main complete"
+  return 0
+}
+
+function exit_handler() {
+  set +e
+  gpu_install_exit_handler
+  gpu_exit_handler
+  pip_exit_handler
+  yarn_exit_handler
+  common_exit_handler
+  return 0
+}
+
+function prepare_to_install() {
+  prepare_spark_env
+  prepare_common_env
+  prepare_pip_env
+  prepare_gpu_env
+  prepare_gpu_install_env
+  trap exit_handler EXIT
+}
+
+prepare_to_install
+
+main
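
A companion sketch for this driver/RAPIDS action, reusing the metadata string the updated tests pass ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK") together with the tests' default_machine_type and T4 accelerator; the bucket path and region are placeholders:

  # Hypothetical invocation; only the metadata string, machine type, and
  # T4 accelerator type are taken from the test file in this commit.
  gcloud dataproc clusters create example-rapids-cluster \
    --region=us-central1 \
    --initialization-actions=gs://YOUR_BUCKET/spark-rapids/spark-rapids.sh \
    --metadata=gpu-driver-provider=NVIDIA,rapids-runtime=SPARK \
    --master-machine-type=n1-highmem-8 \
    --worker-machine-type=n1-highmem-8 \
    --worker-accelerator=type=nvidia-tesla-t4,count=1

Per the RAPIDS_RUNTIME branch in main(), any runtime value other than SPARK (including DASK, which this action deliberately rejects) causes the script, and therefore cluster creation, to fail.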
