
Commit 8089389

nvliyuan and cjac authored
update rapids version for 24.10 release (#1248)
* update v2410 rapids release
* update the readme doc
* update v2412 version
* do not recreate git clone on second pass
* gathering timing data for some long-running sections of the installer
* over-commitment on the disk space cleaned up

Signed-off-by: liyuan <[email protected]>
Co-authored-by: C.J. Collier <[email protected]>
1 parent 169e98e commit 8089389

File tree

3 files changed (+14, -25 lines)

spark-rapids/README.md

Lines changed: 3 additions & 16 deletions
@@ -17,18 +17,8 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 ## RAPIDS Accelerator For Apache Spark
 
 ### Prerequisites
-
-To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3
-
-* Apache Spark 3.0+
-* Hardware Requirements
-  * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later)
-  * Multi-node clusters with homogenous GPU configuration
-* Software Requirements
-  * NVIDIA GPU driver 440.33+
-  * CUDA v11.5/v11.0/v10.2/v10.1
-  * NCCL 2.11.4+
-  * Ubuntu 18.04, Ubuntu 20.04 or Rocky Linux 7, Rocky Linux8, Debian 10, Debian 11
+Please find the [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
+official document for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -59,20 +49,17 @@ export GCS_BUCKET=<your bucket for the logs and notebooks>
 export REGION=<region>
 export NUM_GPUS=1
 export NUM_WORKERS=2
-export CUDA_VER=11.5
 
 gcloud dataproc clusters create $CLUSTER_NAME \
     --region $REGION \
-    --image-version=2.0-ubuntu18 \
+    --image-version=2.2-ubuntu22 \
     --master-machine-type n1-standard-4 \
     --master-boot-disk-size 200 \
     --num-workers $NUM_WORKERS \
     --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \
     --worker-machine-type n1-standard-8 \
     --num-worker-local-ssds 1 \
     --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
-    --optional-components=JUPYTER,ZEPPELIN \
-    --metadata gpu-driver-provider="NVIDIA",rapids-runtime="SPARK",cuda-version="$CUDA_VER" \
     --bucket $GCS_BUCKET \
     --subnet=default \
     --enable-component-gateway
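Although the `cuda-version` metadata line is gone from the README example, spark-rapids.sh still reads optional settings from cluster metadata (see the `get_metadata_attribute` calls in the script diff below), so a specific plugin release can still be pinned at cluster-creation time. A hypothetical sketch, assuming the `spark-rapids-version` metadata key shown in the script:

```bash
# hypothetical: pin the RAPIDS plugin release instead of taking the script default
gcloud dataproc clusters create "${CLUSTER_NAME}" \
    --region "${REGION}" \
    --image-version=2.2-ubuntu22 \
    --worker-accelerator type=nvidia-tesla-t4,count="${NUM_GPUS}" \
    --metadata spark-rapids-version=24.12.0 \
    --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh"
```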

spark-rapids/spark-rapids.sh

Lines changed: 8 additions & 6 deletions
@@ -216,7 +216,7 @@ else
 fi
 
 # Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
 readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
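The bumped default only applies when no override is supplied; `get_metadata_attribute` prefers the value set on the cluster. A minimal sketch of how such a helper is commonly written for Dataproc init actions — the script's actual implementation may differ:

```bash
# sketch, not the script's exact code: resolve a cluster metadata attribute,
# falling back to a default when the key is unset
function get_metadata_attribute() {
  local -r attribute_name="$1"
  local -r default_value="$2"
  # /usr/share/google/get_metadata_value queries the GCE metadata server
  /usr/share/google/get_metadata_value "attributes/${attribute_name}" \
    || echo -n "${default_value}"
}
```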
@@ -261,7 +261,7 @@ IS_MIG_ENABLED=0
 function execute_with_retries() {
   local -r cmd=$1
   for ((i = 0; i < 10; i++)); do
-    if eval "$cmd"; then
+    if time eval "$cmd"; then
       return 0
     fi
     sleep 5
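This is the "gathering timing data" change from the commit message: bash's `time` keyword wraps the whole pipeline and prints real/user/sys durations to stderr (which lands in the init-action log), while the timed command's exit status still drives the `if`, so the retry semantics are unchanged. The same technique is applied to the CUDA runfile install further down. A small standalone illustration with a placeholder command:

```bash
# placeholder command; `time` reports durations on stderr and the
# if-branch still keys off the timed command's exit status
if time eval "sleep 2"; then
  echo "command succeeded; timing was logged above"
fi
```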
@@ -418,8 +418,9 @@ function install_nvidia_gpu_driver() {
   mkdir -p "${WORKDIR}"
   pushd $_
   # Fetch open souce kernel module with corresponding tag
-  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
-    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
+  test -d open-gpu-kernel-modules || \
+    git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
+      --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
   cd ${WORKDIR}/open-gpu-kernel-modules
   #
   # build kernel modules
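This implements the "do not recreate git clone on second pass" item: the `test -d DIR || cmd` guard makes the step idempotent, so re-running the installer (for example after a failed first attempt) no longer fails on an already-existing checkout. The same guard is used for the compute-gpu-monitoring clone below. The guard is shorthand for an explicit conditional:

```bash
# equivalent long form of the guard used in the diff above
if [[ ! -d open-gpu-kernel-modules ]]; then
  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
fi
```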
@@ -451,7 +452,7 @@ function install_nvidia_gpu_driver() {
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
     -o cuda.run
-  bash cuda.run --silent --toolkit --no-opengl-libs
+  time bash cuda.run --silent --toolkit --no-opengl-libs
   rm cuda.run
 else
   # Install from repo provided by NV
@@ -525,7 +526,8 @@ function download_agent(){
   mkdir -p /opt/google
   chmod 777 /opt/google
   cd /opt/google
-  execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
 }
 
 function install_agent_dependency(){

spark-rapids/test_spark_rapids.py

Lines changed: 3 additions & 3 deletions
@@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
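The boot-disk reduction is the "over-commitment on the disk space" item from the commit message: provisioning 1024GB boot disks per test-cluster node reserved far more persistent-disk quota than the suite needs, and 50GB keeps concurrent test runs within quota. Assuming the initialization-actions repo's usual Bazel harness (an assumption — the exact target name may differ), the suite can be run locally along the lines of:

```bash
# assumption: standard Bazel test target layout in the init-actions repo
bazel test //spark-rapids:test_spark_rapids
```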
