1515#
1616# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
1717
# Strict mode: abort on an unhandled command failure (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
# Command tracing (-x) is deliberately not enabled.
set -euo pipefail
1919
# Readers for individual fields of an os-release file.
#
# Arguments: $1 - optional path to the os-release file
#                 (defaults to /etc/os-release)
# Outputs:   the field's value on stdout; xargs strips surrounding quotes
#            and whitespace
# Returns:   the pipeline status; with `pipefail` set this is non-zero when
#            the field is absent from the file

# Print the ID field (the patterns are anchored at column 0, so ID_LIKE and
# VERSION_ID lines are not picked up by mistake).
function os_id() {
  grep '^ID=' "${1:-/etc/os-release}" | cut -d= -f2 | xargs
}

# Print the VERSION_ID field.
function os_version() {
  grep '^VERSION_ID=' "${1:-/etc/os-release}" | cut -d= -f2 | xargs
}

# Print the VERSION_CODENAME field.
function os_codename() {
  grep '^VERSION_CODENAME=' "${1:-/etc/os-release}" | cut -d= -f2 | xargs
}
2323
# Version comparison helpers built on `sort -V` (version-aware ordering, so
# "11" sorts after "9").
#
# Arguments: $1, $2 - the two version strings to compare
# Returns:   0 when the relation holds, non-zero otherwise
#
# printf is used instead of `echo -e` so backslashes in the arguments are
# passed through untouched.

# version_ge A B: true when A >= B, i.e. A is the last entry after sorting.
function version_ge() {
  [[ "$1" == "$(printf '%s\n' "$1" "$2" | sort -V | tail -n1)" ]]
}

# version_gt A B: true when A > B; identical strings are never "greater".
function version_gt() {
  if [[ "$1" == "$2" ]]; then return 1; fi
  version_ge "$1" "$2"
}

# version_le A B: true when A <= B, i.e. A is the first entry after sorting.
function version_le() {
  [[ "$1" == "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" ]]
}

# version_lt A B: true when A < B; identical strings are never "less".
function version_lt() {
  if [[ "$1" == "$2" ]]; then return 1; fi
  version_le "$1" "$2"
}
2828
2929readonly -A supported_os=(
3030 [' debian' ]=" 10 11 12"
@@ -34,24 +34,25 @@ readonly -A supported_os=(
3434
3535# dynamically define OS version test utility functions
# _os_version is the version string baked into the generated predicates.
# For rocky, everything from the first non-digit onward is removed, leaving
# only the leading number; other distributions use VERSION_ID as reported.
if [[ "$(os_id)" == "rocky" ]]; then
  _os_version="$(os_version | sed -e 's/[^0-9].*$//g')"
else
  _os_version="$(os_version)"
fi

# For every known distribution generate:
#   is_<os>            - true when the running OS id equals <os>
# and, for every version listed for it in supported_os:
#   is_<os><ver>       - true on <os> at exactly that version
#   ge_<os><ver>       - true on <os> at that version or newer
#   le_<os><ver>       - true on <os> at that version or older
# The <ver> suffix of the function name is the version with everything from
# the first dot onward removed.
#
# $(os_id) and ${_os_version} are expanded here, while the eval string is
# built, so the generated functions compare against values captured at
# definition time instead of re-reading the os-release file on every call.
for os_id_val in 'rocky' 'ubuntu' 'debian'; do
  eval "function is_${os_id_val}() { [[ \"$(os_id)\" == '${os_id_val}' ]] ; }"

  # Unquoted on purpose: the supported_os entry is a space-separated list.
  for osver in ${supported_os["${os_id_val}"]}; do
    eval "function is_${os_id_val}${osver%%.*}() { is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; }"
    eval "function ge_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; }"
    eval "function le_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; }"
  done
done
4849
# Returns 0 when either is_debian or is_ubuntu succeeds, non-zero otherwise.
function is_debuntu() {
  is_debian || is_ubuntu
}
5051
# Print a compact form of the OS version on stdout:
#   ubuntu - every non-digit character removed from the version
#   rocky  - everything from the first non-digit onward removed
#   other  - the version exactly as os_version prints it
function os_vercat() {
  if is_ubuntu; then
    os_version | sed -e 's/[^0-9]//g'
  elif is_rocky; then
    os_version | sed -e 's/[^0-9].*$//g'
  else
    os_version
  fi
}
5556
5657function repair_old_backports {
5758 if ! is_debuntu ; then return ; fi
@@ -96,8 +97,7 @@ function print_metadata_value_if_exists() {
9697}
9798
9899# replicates /usr/share/google/get_metadata_value
99- function get_metadata_value() (
100- set +x
100+ function get_metadata_value() {
101101 local readonly varname=$1
102102 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
103103 # Print the instance metadata value.
@@ -110,14 +110,13 @@ function get_metadata_value() (
110110 fi
111111
112112 return ${return_code}
113- )
113+ }
114114
# Print the value of an instance metadata attribute, or a default.
#
# Arguments: $1 - attribute name; looked up as "attributes/<name>" through
#                 get_metadata_value
#            $2 - optional default, printed when get_metadata_value fails
#                 (empty when omitted)
# Outputs:   the attribute value or the default on stdout, without a
#            trailing newline for the default
function get_metadata_attribute() {
  local -r attribute_name="$1"
  local -r default_value="${2:-}"
  # printf rather than `echo -n`, so a default such as "-e" or "-n" is
  # printed literally instead of being consumed as an echo option.
  get_metadata_value "attributes/${attribute_name}" || printf '%s' "${default_value}"
}
121120
122121OS_NAME=" $( lsb_release -is | tr ' [:upper:]' ' [:lower:]' ) "
123122distribution=$( . /etc/os-release; echo $ID$VERSION_ID )
@@ -216,13 +215,13 @@ function set_cuda_version() {
216215 readonly CUDA_FULL_VERSION
217216}
218217
# CUDA major-version predicates. Each one compares the part of CUDA_VERSION
# before the first dot against a fixed major number and returns 0 when the
# relation holds.
# Globals: CUDA_VERSION (read)

# CUDA major version is exactly / at most / at least 12.
function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; }
function le_cuda12() { version_le "${CUDA_VERSION%%.*}" "12" ; }
function ge_cuda12() { version_ge "${CUDA_VERSION%%.*}" "12" ; }

# CUDA major version is exactly / at most / at least 11.
function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; }
function le_cuda11() { version_le "${CUDA_VERSION%%.*}" "11" ; }
function ge_cuda11() { version_ge "${CUDA_VERSION%%.*}" "11" ; }
226225
227226function set_driver_version() {
228227 local gpu_driver_url
@@ -463,17 +462,14 @@ MIG_MAJOR_CAPS=0
463462IS_MIG_ENABLED=0
464463
465464function execute_with_retries() (
466- set +x
467465 local -r cmd=" $* "
468466
469467 if [[ " $cmd " =~ " ^apt-get install" ]] ; then
470468 apt-get -y clean
471469 apt-get -o DPkg::Lock::Timeout=60 -y autoremove
472470 fi
473471 for (( i = 0 ; i < 3 ; i++ )) ; do
474- set -x
475472 time eval " $cmd " > " ${install_log} " 2>&1 && retval=$? || { retval=$? ; cat " ${install_log} " ; }
476- set +x
477473 if [[ $retval == 0 ]] ; then return 0 ; fi
478474 sleep 5
479475 done
@@ -718,8 +714,8 @@ function install_nvidia_nccl() {
718714 mark_complete nccl
719715}
720716
# Driver-source predicates; each returns 0 when GPU_DRIVER_PROVIDER equals
# the named value exactly (case-sensitive).
# Globals: GPU_DRIVER_PROVIDER (read)
function is_src_nvidia() { [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; }
function is_src_os() { [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; }
723719
724720function install_nvidia_cudnn() {
725721 is_complete cudnn && return
@@ -1576,13 +1572,21 @@ EOF
15761572 fi
15771573 local executor_cores
15781574 executor_cores=" $( nproc | perl -MPOSIX -pe ' $_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2' ) "
1575+ [[ " ${executor_cores} " == " 0" ]] && executor_cores=1
15791576 local executor_memory
15801577 executor_memory_gb=" $( awk ' /^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe ' $_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )' ) "
15811578 local task_cpus=2
1579+ [[ " ${task_cpus} " -gt " ${executor_cores} " ]] && task_cpus=" ${executor_cores} "
15821580 local gpu_amount
15831581# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
15841582 gpu_amount=" $( perl -e " print 1 / ${executor_cores} " ) "
15851583
1584+ # the gpu.amount properties are not appropriate for the version of
1585+ # spark shipped with 1.5 images using the capacity scheduler. TODO:
1586+ # In order to get spark rapids GPU accelerated SQL working on 1.5
1587+ # images, we must configure the Fair scheduler
1588+ version_ge " ${DATAPROC_IMAGE_VERSION} " " 2.0" || return
1589+
15861590 # TODO: when running this script to customize an image, this file
15871591 # needs to be written *after* bdutil completes
15881592
@@ -2044,7 +2048,7 @@ function exit_handler() {
20442048 if ${gsutil_stat_cmd} " ${building_file} " ; then ${gsutil_cmd} rm " ${building_file} " || true ; fi
20452049 fi
20462050
2047- set +ex
2051+ set +e
20482052 echo " Exit handler invoked"
20492053
20502054 # Clear pip cache
0 commit comments