
Commit 3ca9c84

[spark-rapids] generate actions from templates
1 parent aa792c3 commit 3ca9c84

4 files changed, +257 −92 lines changed

spark-rapids/test_mig.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import os
+import time
+
+import pkg_resources
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from integration_tests.dataproc_test_case import DataprocTestCase
+
+class MigTestCase(DataprocTestCase):
+  COMPONENT = "rapids"
+  INIT_ACTIONS = ["spark-rapids/mig.sh"]
+
+  GPU_H100 = "type=nvidia-h100-80gb,count=2"
+  default_machine_type = "n1-standard-32"
+
+  def verify_mig_instance(self, name):
+    self.assert_instance_command(name,
+        "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
+
+  @parameterized.parameters(
+      ("SINGLE", ["m"], GPU_H100, None, "NVIDIA", "us-central1-c"),
+      # ("STANDARD", ["m", "w-0", "w-1"], None, GPU_H100, "NVIDIA", "us-central1-c"),
+      # ("KERBEROS", ["m", "w-0", "w-1"], None, GPU_H100, "NVIDIA", "us-central1-c"),
+  )
+  def test_install_gpu_with_mig(self, configuration, machine_suffixes,
+                                master_accelerator, worker_accelerator,
+                                driver_provider, zone):
+
+    if configuration == 'SINGLE' and master_accelerator is None:
+      master_accelerator = self.GPU_H100
+
+    self.createCluster(
+        configuration,
+        self.INIT_ACTIONS,
+        zone=zone,
+        master_machine_type="a3-highgpu-2g" if master_accelerator == self.GPU_H100 else self.default_machine_type,
+        worker_machine_type="a3-highgpu-2g" if worker_accelerator == self.GPU_H100 else self.default_machine_type,
+        master_accelerator=master_accelerator,
+        worker_accelerator=worker_accelerator,
+        metadata=None,
+        timeout_in_minutes=30,
+        network_interface="nic-type=GVNIC,address=,network=default",
+        boot_disk_size="40GB",
+    )
+
+    for machine_suffix in machine_suffixes:
+      self.verify_mig_instance("{}-{}".format(self.getClusterName(),
+                                              machine_suffix))
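
The MIG check in verify_mig_instance is a single shell pipeline. As a sketch of what each stage does on the target node (the trailing echo is illustrative and not part of the test):

# Print each GPU's current MIG mode, one value per line, no CSV header.
# `uniq` collapses adjacent identical lines, so a homogeneous set of GPUs
# reduces to a single line.
# `xargs -I % test % = 'Enabled'` runs `test` against each remaining line;
# the pipeline exits non-zero unless every value equals 'Enabled'.
/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader \
  | uniq \
  | xargs -I % test % = 'Enabled'
echo "MIG enabled on all GPUs: $?"   # 0 means enabled everywhere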

spark-rapids/test_spark_rapids.py

Lines changed: 23 additions & 92 deletions
@@ -1,4 +1,5 @@
 import os
+import time
 
 import pkg_resources
 from absl.testing import absltest
@@ -11,31 +12,27 @@ class SparkRapidsTestCase(DataprocTestCase):
   INIT_ACTIONS = ["spark-rapids/spark-rapids.sh"]
 
   GPU_T4 = "type=nvidia-tesla-t4"
-  GPU_A100 = "type=nvidia-tesla-a100"
+  default_machine_type = "n1-highmem-8"
 
   # Tests for RAPIDS init action
   XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids.scala"
   XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME = "verify_xgboost_spark_rapids_sql.scala"
+  cmd_template = """echo :quit | spark-shell \
+      --conf spark.executor.resource.gpu.amount=1 \
+      --conf spark.task.resource.gpu.amount=1 \
+      --conf spark.dynamicAllocation.enabled=false -i {}"""
 
   def verify_spark_instance(self, name):
     self.assert_instance_command(name, "nvidia-smi")
 
-  def verify_mig_instance(self, name):
-    self.assert_instance_command(name,
-        "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'")
-
   def verify_spark_job(self):
     instance_name = "{}-m".format(self.getClusterName())
     self.upload_test_file(
         os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_TEST_SCRIPT_FILE_NAME,
                             instance_name)
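
Factoring the spark-shell invocation into cmd_template replaces two near-identical inline strings. After .format() fills in the script name, the command that runs on the master node looks like this (shown with the non-SQL script; the SQL variant differs only in the file name):

# Run a Scala verification script in spark-shell with one GPU per executor
# and per task, dynamic allocation off; `echo :quit` ends the REPL session
# once the -i script has finished.
echo :quit | spark-shell \
    --conf spark.executor.resource.gpu.amount=1 \
    --conf spark.task.resource.gpu.amount=1 \
    --conf spark.dynamicAllocation.enabled=false \
    -i verify_xgboost_spark_rapids.scala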

@@ -46,24 +43,14 @@ def verify_spark_job_sql(self):
            os.path.dirname(os.path.abspath(__file__)),
            self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME), instance_name)
     self.assert_instance_command(
-        instance_name, """echo :quit | spark-shell \
-            --conf spark.executor.resource.gpu.amount=1 \
-            --conf spark.task.resource.gpu.amount=1 \
-            --conf spark.dynamicAllocation.enabled=false -i {}""".format(
-            self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
+        instance_name, self.cmd_template.format(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME))
     self.remove_test_script(self.XGBOOST_SPARK_SQL_TEST_SCRIPT_FILE_NAME,
                             instance_name)
 
   @parameterized.parameters(("SINGLE", ["m"], GPU_T4),
                             ("STANDARD", ["w-0"], GPU_T4))
   def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
     optional_components = None
     metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
 
@@ -72,106 +59,50 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         self.INIT_ACTIONS,
         optional_components=optional_components,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type=self.default_machine_type,
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
       self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                 machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job()
-
-  @parameterized.parameters(("SINGLE", ["m"], GPU_T4),
-                            ("STANDARD", ["w-0"], GPU_T4))
-  def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
-    optional_components = None
-    metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
-
-    self.createCluster(
-        configuration,
-        self.INIT_ACTIONS,
-        optional_components=optional_components,
-        metadata=metadata,
-        machine_type="n1-standard-4",
-        master_accelerator=accelerator if configuration == "SINGLE" else None,
-        worker_accelerator=accelerator,
-        boot_disk_size="50GB",
-        timeout_in_minutes=30)
-
-    for machine_suffix in machine_suffixes:
-      self.verify_spark_instance("{}-{}".format(self.getClusterName(),
-                                                machine_suffix))
-    # Only need to do this once
-    self.verify_spark_job_sql()
+    if self.getImageOs() == 'rocky' \
+        and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
+        and configuration == 'SINGLE':
+      print("skipping spark job test; 2.1-rocky8 and 2.0-rocky8 single-instance tests are known to fail")
+    else:
+      # Only need to do this once
+      self.verify_spark_job()
+      # Only need to do this once
+      self.verify_spark_job_sql()
 
   @parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
   def test_non_default_cuda_versions(self, configuration, machine_suffixes,
                                      accelerator, cuda_version, driver_version):
 
-    if self.getImageOs() == "rocky":
-      self.skipTest("Not supported for Rocky OS")
-
-    if self.getImageVersion() <= pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in 2.0 and earlier images")
-
     metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
                 ",cuda-version={0},driver-version={1}".format(cuda_version, driver_version))
 
     self.createCluster(
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-standard-4",
+        machine_type="n1-standard-32",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="50GB",
+        boot_disk_size="40GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
      self.verify_spark_instance("{}-{}".format(self.getClusterName(),
                                                machine_suffix))
     # Only need to do this once
     self.verify_spark_job()
-
-    # Disable MIG related test due to the lack of A100 GPUs, more detail see
-    # https://github.com/GoogleCloudDataproc/initialization-actions/pull/1070
-
-    # @parameterized.parameters(("STANDARD", ["m", "w-0", "w-1"], None, GPU_A100, "NVIDIA", "us-central1-c"))
-    # def test_install_gpu_with_mig(self, configuration, machine_suffixes,
-    #                               master_accelerator, worker_accelerator,
-    #                               driver_provider, zone):
-    #   if self.getImageVersion() < pkg_resources.parse_version("2.0") or self.getImageOs() == "rocky":
-    #     self.skipTest("Not supported in pre 2.0 or Rocky images")
-    #
-    #   if self.getImageVersion() == pkg_resources.parse_version("2.1"):
-    #     self.skipTest("Not supported in 2.1 images")
-    #
-    #   self.createCluster(
-    #       configuration,
-    #       self.INIT_ACTIONS,
-    #       zone=zone,
-    #       master_machine_type="n1-standard-4",
-    #       worker_machine_type="a2-highgpu-1g",
-    #       master_accelerator=master_accelerator,
-    #       worker_accelerator=worker_accelerator,
-    #       metadata=None,
-    #       timeout_in_minutes=30,
-    #       boot_disk_size="200GB",
-    #       startup_script="spark-rapids/mig.sh")
-    #
-    #   for machine_suffix in ["w-0", "w-1"]:
-    #     self.verify_mig_instance("{}-{}".format(self.getClusterName(),
-    #                                             machine_suffix))
+    # Only need to do this once
+    self.verify_spark_job_sql()
 
 if __name__ == "__main__":
   absltest.main()
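
For the parameter set above ("12.4.0", "550.54.14"), the metadata string built by test_non_default_cuda_versions resolves to the line below; presumably the harness forwards it to cluster creation as a --metadata flag (shown here as a sketch, not captured output):

--metadata=gpu-driver-provider=NVIDIA,rapids-runtime=SPARK,cuda-version=12.4.0,driver-version=550.54.14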

templates/spark-rapids/mig.sh.in

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+[% INSERT legal/license_header %]
+#
+# This script installs NVIDIA GPU drivers and enables MIG on Hopper
+# GPU architectures.
+#
+# The script should be specified in the --initialization-actions=
+# option, and --metadata=ENABLE_MIG can be used to enable or disable
+# MIG; the default is to enable it. The script configures the MIG
+# devices based on the MIG_CGI profiles specified via, for example,
+# --metadata=^:^MIG_CGI='9,9'. If MIG_CGI is not specified, the script
+# assumes an H100 and configures two instances with profile id 9.
+#
+[% PROCESS common/template_disclaimer %]
+[% INSERT common/util_functions %]
+[% INSERT common/install_functions %]
+[% INSERT common/yarn_functions %]
+[% INSERT gpu/mig_functions %]
+[% INSERT gpu/util_functions %]
+[% INSERT gpu/install_functions %]
+[% INCLUDE gpu/yarn_functions %]
+[% INSERT gpu/spark_functions %]
+
+set -euxo pipefail
+
+function main() {
+  if [[ "${nvsmi_works}" == "1" ]] ; then
+    # nvidia-smi only works once the drivers are installed; if it
+    # works, query the current MIG mode of each GPU.
+    query_nvsmi
+    local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()'
+    set +e
+    migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')"
+    set -e
+    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
+
+    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
+        if (echo "${migquery_result}" | grep Enabled); then
+          IS_MIG_ENABLED=1
+          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
+          MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
+          fetch_mig_scripts
+        fi
+      fi
+    fi
+  fi
+
+  # If MIG is already enabled, the drivers have already been installed
+  if [[ $IS_MIG_ENABLED -eq 0 ]]; then
+    install_nvidia_gpu_driver
+    install_cuda
+    load_kernel_module
+
+    # Install GPU metrics collection in Stackdriver if needed
+    if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+      install_gpu_agent
+      # install_gpu_monitoring_agent
+      echo 'GPU metrics agent successfully deployed.'
+    else
+      echo 'GPU metrics agent has not been installed.'
+    fi
+    configure_gpu_exclusive_mode
+  fi
+
+  setup_gpu_yarn
+
+  echo "yarn setup complete"
+
+  enable_and_configure_mig
+
+  echo "main complete"
+  return 0
+}
+
+function exit_handler() {
+  set +e
+  gpu_install_exit_handler
+  gpu_exit_handler
+  pip_exit_handler
+  yarn_exit_handler
+  common_exit_handler
+  return 0
+}
+
+function prepare_to_install() {
+  prepare_spark_env
+  prepare_common_env
+  prepare_pip_env
+  prepare_gpu_env
+  prepare_gpu_install_env
+  trap exit_handler EXIT
+}
+
+prepare_to_install
+
+main
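
The helper functions (query_nvsmi, fetch_mig_scripts, setup_gpu_yarn, and the rest) come from the [% INSERT %] template fragments and are not part of this commit. As a sketch, assuming query_nvsmi captures the XML dump of nvidia-smi into ${nvsmi_query_xml}, the MIG-mode query in main() is roughly equivalent to:

# Assumption: query_nvsmi writes `nvidia-smi -q -x` output to this file.
nvsmi_query_xml=/tmp/nvsmi_query.xml
/usr/bin/nvidia-smi -q -x > "${nvsmi_query_xml}"

# Extract each GPU's current MIG mode; GPUs without MIG support report
# N/A, which the script filters out before counting distinct modes.
xmllint --xpath '//nvidia_smi_log/*/mig_mode/current_mig/text()' \
  "${nvsmi_query_xml}" | grep -v 'N/A'

And as a usage sketch only, a cluster that runs the generated mig.sh with a custom MIG profile might be created along these lines; the cluster name, region, and GCS path are placeholders, not values from this commit:

# Hypothetical invocation; adjust names, region, and bucket.
gcloud dataproc clusters create example-mig-cluster \
  --region=us-central1 \
  --zone=us-central1-c \
  --master-machine-type=a3-highgpu-2g \
  --master-accelerator=type=nvidia-h100-80gb,count=2 \
  --initialization-actions=gs://example-bucket/spark-rapids/mig.sh \
  --metadata=^:^MIG_CGI='9,9'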
