
Commit 8089389

nvliyuan and cjac authored
update rapids version for 24.10 release (#1248)
* update v2410 rapids release
* update the readme doc
* update v2412 version
* do not recreate git clone on second pass
* gathering timing data for some long-running sections of the installer
* over-commitment on the disk space cleaned up

Signed-off-by: liyuan <[email protected]>
Co-authored-by: C.J. Collier <[email protected]>
1 parent 169e98e commit 8089389

File tree

3 files changed (+14, -25 lines)

spark-rapids/README.md

Lines changed: 3 additions & 16 deletions
@@ -17,18 +17,8 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0)+.
 ## RAPIDS Accelerator For Apache Spark
 
 ### Prerequisites
-
-To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3
-
-* Apache Spark 3.0+
-* Hardware Requirements
-  * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later)
-  * Multi-node clusters with homogenous GPU configuration
-* Software Requirements
-  * NVIDIA GPU driver 440.33+
-  * CUDA v11.5/v11.0/v10.2/v10.1
-  * NCCL 2.11.4+
-  * Ubuntu 18.04, Ubuntu 20.04 or Rocky Linux 7, Rocky Linux8, Debian 10, Debian 11
+Please find the [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
+official document for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -59,20 +49,17 @@ export GCS_BUCKET=<your bucket for the logs and notebooks>
 export REGION=<region>
 export NUM_GPUS=1
 export NUM_WORKERS=2
-export CUDA_VER=11.5
 
 gcloud dataproc clusters create $CLUSTER_NAME \
     --region $REGION \
-    --image-version=2.0-ubuntu18 \
+    --image-version=2.2-ubuntu22 \
     --master-machine-type n1-standard-4 \
     --master-boot-disk-size 200 \
     --num-workers $NUM_WORKERS \
     --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \
     --worker-machine-type n1-standard-8 \
     --num-worker-local-ssds 1 \
     --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
-    --optional-components=JUPYTER,ZEPPELIN \
-    --metadata gpu-driver-provider="NVIDIA",rapids-runtime="SPARK",cuda-version="$CUDA_VER" \
     --bucket $GCS_BUCKET \
     --subnet=default \
     --enable-component-gateway
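Although the `cuda-version` metadata line is gone from the README example, spark-rapids.sh still reads optional settings from cluster metadata (see the `get_metadata_attribute` calls in the script diff below), so a specific plugin release can still be pinned at cluster-creation time. A hypothetical sketch, assuming the `spark-rapids-version` metadata key shown in the script:

```bash
# hypothetical: pin the RAPIDS plugin release instead of taking the script default
gcloud dataproc clusters create "${CLUSTER_NAME}" \
    --region "${REGION}" \
    --image-version=2.2-ubuntu22 \
    --worker-accelerator type=nvidia-tesla-t4,count="${NUM_GPUS}" \
    --metadata spark-rapids-version=24.12.0 \
    --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh"
```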

spark-rapids/spark-rapids.sh

Lines changed: 8 additions & 6 deletions
@@ -216,7 +216,7 @@ else
 fi
 
 # Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
 readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
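The bumped default only applies when no override is supplied; `get_metadata_attribute` prefers the value set on the cluster. A minimal sketch of how such a helper is commonly written for Dataproc init actions — the script's actual implementation may differ:

```bash
# sketch, not the script's exact code: resolve a cluster metadata attribute,
# falling back to a default when the key is unset
function get_metadata_attribute() {
  local -r attribute_name="$1"
  local -r default_value="$2"
  # /usr/share/google/get_metadata_value queries the GCE metadata server
  /usr/share/google/get_metadata_value "attributes/${attribute_name}" \
    || echo -n "${default_value}"
}
```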
@@ -261,7 +261,7 @@ IS_MIG_ENABLED=0
 function execute_with_retries() {
   local -r cmd=$1
   for ((i = 0; i < 10; i++)); do
-    if eval "$cmd"; then
+    if time eval "$cmd"; then
       return 0
     fi
     sleep 5
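This is the "gathering timing data" change from the commit message: bash's `time` keyword wraps the whole pipeline and prints real/user/sys durations to stderr (which lands in the init-action log), while the timed command's exit status still drives the `if`, so the retry semantics are unchanged. The same technique is applied to the CUDA runfile install further down. A small standalone illustration with a placeholder command:

```bash
# placeholder command; `time` reports durations on stderr and the
# if-branch still keys off the timed command's exit status
if time eval "sleep 2"; then
  echo "command succeeded; timing was logged above"
fi
```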
@@ -418,8 +418,9 @@ function install_nvidia_gpu_driver() {
   mkdir -p "${WORKDIR}"
   pushd $_
   # Fetch open souce kernel module with corresponding tag
-  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
-    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
+  test -d open-gpu-kernel-modules || \
+    git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
+      --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
   cd ${WORKDIR}/open-gpu-kernel-modules
   #
   # build kernel modules
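This implements the "do not recreate git clone on second pass" item: the `test -d DIR || cmd` guard makes the step idempotent, so re-running the installer (for example after a failed first attempt) no longer fails on an already-existing checkout. The same guard is used for the compute-gpu-monitoring clone below. The guard is shorthand for an explicit conditional:

```bash
# equivalent long form of the guard used in the diff above
if [[ ! -d open-gpu-kernel-modules ]]; then
  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
fi
```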
@@ -451,7 +452,7 @@ function install_nvidia_gpu_driver() {
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
     -o cuda.run
-  bash cuda.run --silent --toolkit --no-opengl-libs
+  time bash cuda.run --silent --toolkit --no-opengl-libs
   rm cuda.run
 else
   # Install from repo provided by NV
@@ -525,7 +526,8 @@ function download_agent(){
   mkdir -p /opt/google
   chmod 777 /opt/google
   cd /opt/google
-  execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
 }
 
 function install_agent_dependency(){

spark-rapids/test_spark_rapids.py

Lines changed: 3 additions & 3 deletions
@@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
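The boot-disk reduction is the "over-commitment on the disk space" item from the commit message: provisioning 1024GB boot disks per test-cluster node reserved far more persistent-disk quota than the suite needs, and 50GB keeps concurrent test runs within quota. Assuming the initialization-actions repo's usual Bazel harness (an assumption — the exact target name may differ), the suite can be run locally along the lines of:

```bash
# assumption: standard Bazel test target layout in the init-actions repo
bazel test //spark-rapids:test_spark_rapids
```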
