 import os
+import time
 
 import pkg_resources
 from absl.testing import absltest
@@ -11,31 +12,27 @@ class SparkRapidsTestCase(DataprocTestCase):
   INIT_ACTIONS = ["spark-rapids/spark-rapids.sh"]
 
   GPU_T4 = "type=nvidia-tesla-t4"
-  GPU_A100 = "type=nvidia-tesla-a100"
+  default_machine_type = "n1-highmem-8"
 
   # Tests for RAPIDS init action
   XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids.scala"
   XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids_sql.scala"
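+  # Shared spark-shell invocation used by both XGBoost verification jobs below;
+  # the format() placeholder takes the name of the uploaded test script.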
+  cmd_template = """echo :quit | spark-shell \
+      --conf spark.executor.resource.gpu.amount=1 \
+      --conf spark.task.resource.gpu.amount=1 \
+      --conf spark.dynamicAllocation.enabled=false -i {}"""
 
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
 
-  def verify_mig_instance(self, name):
-    self.assert_instance_command(name,
-        "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
-
   def verify_spark_job(self):
     instance_name = "{}-m".format(self.getClusterName())
     self.upload_test_file(
         os.path.join(
             os.path.dirname(os.path.abspath(__file__)),
             self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME,
                             instance_name)
 
@@ -46,24 +43,14 @@ def verify_spark_job_sql(self):
             os.path.dirname(os.path.abspath(__file__)),
             self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME,
                             instance_name)
 
   @parameterized.parameters(("SINGLE", ["m"], GPU_T4),
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -72,106 +59,50 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         self.INIT_ACTIONS,
         optional_components=optional_components,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type=self.default_machine_type,
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
       self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                 machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job()
-
-  @parameterized.parameters(("SINGLE", ["m"], GPU_T4),
-                            ("STANDARD", ["w-0"], GPU_T4))
-  def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
-    optional_components = None
-    metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
-
-    self.createCluster(
-        configuration,
-        self.INIT_ACTIONS,
-        optional_components=optional_components,
-        metadata=metadata,
-        machine_type="n1-standard-4",
-        master_accelerator=accelerator if configuration == "SINGLE" else None,
-        worker_accelerator=accelerator,
-        boot_disk_size="50GB",
-        timeout_in_minutes=30)
-
-    for machine_suffix in machine_suffixes:
-      self.verify_spark_instance("{}-{}".format(self.getClusterName(),
-                                                machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job_sql()
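+    # nvidia-smi was already checked on each node above; only the Spark job
+    # verification is skipped on single-node rocky8 clusters with 2.0/2.1 images.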
+    if (self.getImageOs() == 'rocky') \
+        and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
+        and configuration == 'SINGLE':
+      print("skipping spark job test; 2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail")
+    else:
+      # Only need to do this once
+      self.verify_spark_job()
+      # Only need to do this once
+      self.verify_spark_job_sql()
 
   @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
     metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
                 ",cuda-version={0},driver-version={1}".format(cuda_version, driver_version))
 
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type="n1-standard-32",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
        timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
       self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                 machine_suffix))
     # Only need to do this once
     self.verify_spark_job()
-
-  # Disable MIG related test due to the lack of A100 GPUs, more detail see
-  # https://github.com/GoogleCloudDataproc/initialization-actions/pull/1070
-
-  # @parameterized.parameters(("STANDARD", ["m", "w-0", "w-1"], None, GPU_A100, "NVIDIA", "us-central1-c"))
-  # def test_install_gpu_with_mig(self, configuration, machine_suffixes,
-  #                               master_accelerator, worker_accelerator,
-  #                               driver_provider, zone):
-  #   if self.getImageVersion() < pkg_resources.parse_version("2.0") or self.getImageOs() == "rocky":
-  #     self.skipTest("Not supported in pre 2.0 or Rocky images")
-  #
-  #   if self.getImageVersion() == pkg_resources.parse_version("2.1"):
-  #     self.skipTest("Not supported in 2.1 images")
-  #
-  #   self.createCluster(
-  #       configuration,
-  #       self.INIT_ACTIONS,
-  #       zone=zone,
-  #       master_machine_type="n1-standard-4",
-  #       worker_machine_type="a2-highgpu-1g",
-  #       master_accelerator=master_accelerator,
-  #       worker_accelerator=worker_accelerator,
-  #       metadata=None,
-  #       timeout_in_minutes=30,
-  #       boot_disk_size="200GB",
-  #       startup_script="spark-rapids/mig.sh")
-  #
-  #   for machine_suffix in ["w-0", "w-1"]:
-  #     self.verify_mig_instance("{}-{}".format(self.getClusterName(),
-  #                                             machine_suffix))
+    # Only need to do this once
+    self.verify_spark_job_sql()
 
 if __name__ == "__main__":
   absltest.main()