Skip to content

Commit 57cbfc6

Browse files
authored
[gpu] re-include tests for running without GPU #1315
* revert function-as-subshell experiment intended to reduce debug noise
* validate and correct invalid values for executor_cores and task_cpus
* re-include tests for running the script with no GPU hardware
1 parent 1d9ec7f commit 57cbfc6

File tree

2 files changed

+39
-37
lines changed

2 files changed

+39
-37
lines changed

gpu/install_gpu_driver.sh

Lines changed: 39 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,16 @@
1515
#
1616
# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
1717

18-
set -euxo pipefail
18+
set -euo pipefail
1919

20-
function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; )
21-
function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
22-
function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
20+
# Print the value of the ID= line from /etc/os-release.
# The value is passed through xargs, which trims surrounding whitespace
# and quote characters. The exit status is that of the pipeline.
function os_id() {
  grep '^ID=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
}
21+
# Print the value of the VERSION_ID= line from /etc/os-release.
# The value is passed through xargs, which trims surrounding whitespace
# and quote characters. The exit status is that of the pipeline.
function os_version() {
  grep '^VERSION_ID=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
}
22+
# Print the value of the VERSION_CODENAME= line from /etc/os-release.
# The value is passed through xargs, which trims surrounding whitespace
# and quote characters. The exit status is that of the pipeline.
function os_codename() {
  grep '^VERSION_CODENAME=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
}
2323

24-
function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
25-
function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
26-
function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
27-
function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
24+
# Succeed when version string $1 is greater than or equal to $2.
# Arguments: $1 - left-hand version, $2 - right-hand version
# Returns:   0 when $1 sorts last (or ties) under `sort -V`, 1 otherwise
# printf '%s\n' is used rather than `echo -e` so that backslashes in the
# arguments are compared literally instead of being expanded as escapes.
function version_ge() {
  [[ "$1" == "$(printf '%s\n' "$1" "$2" | sort -V | tail -n1)" ]]
}
25+
# Succeed only when version string $1 is strictly greater than $2.
# Identical strings are rejected up front; every other case is delegated
# to version_ge.
function version_gt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_ge "$1" "$2"
}
26+
# Succeed when version string $1 is less than or equal to $2.
# Arguments: $1 - left-hand version, $2 - right-hand version
# Returns:   0 when $1 sorts first (or ties) under `sort -V`, 1 otherwise
# printf '%s\n' is used rather than `echo -e` so that backslashes in the
# arguments are compared literally instead of being expanded as escapes.
function version_le() {
  [[ "$1" == "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" ]]
}
27+
# Succeed only when version string $1 is strictly less than $2.
# Identical strings are rejected up front; every other case is delegated
# to version_le.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}
2828

2929
readonly -A supported_os=(
3030
['debian']="10 11 12"
@@ -34,24 +34,25 @@ readonly -A supported_os=(
3434

3535
# dynamically define OS version test utility functions
3636
if [[ "$(os_id)" == "rocky" ]];
37-
then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
38-
else _os_version="$(os_version)"; fi
37+
then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
38+
else _os_version="$(os_version)"
39+
fi
3940
for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
40-
eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
41+
eval "function is_${os_id_val}() { [[ \"$(os_id)\" == '${os_id_val}' ]] ; }"
4142

4243
for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
43-
eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
44-
eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
45-
eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
44+
eval "function is_${os_id_val}${osver%%.*}() { is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; }"
45+
eval "function ge_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; }"
46+
eval "function le_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; }"
4647
done
4748
done
4849

49-
function is_debuntu() ( set +x ; is_debian || is_ubuntu ; )
50+
# Succeed when either is_debian or is_ubuntu succeeds.
# is_ubuntu is only consulted when is_debian fails, and its status is
# then the function's status.
function is_debuntu() {
  if is_debian; then
    return 0
  fi
  is_ubuntu
}
5051

51-
function os_vercat() ( set +x
52+
function os_vercat() {
5253
if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'
5354
elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g'
54-
else os_version ; fi ; )
55+
else os_version ; fi ; }
5556

5657
function repair_old_backports {
5758
if ! is_debuntu ; then return ; fi
@@ -96,8 +97,7 @@ function print_metadata_value_if_exists() {
9697
}
9798

9899
# replicates /usr/share/google/get_metadata_value
99-
function get_metadata_value() (
100-
set +x
100+
function get_metadata_value() {
101101
local readonly varname=$1
102102
local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
103103
# Print the instance metadata value.
@@ -110,14 +110,13 @@ function get_metadata_value() (
110110
fi
111111

112112
return ${return_code}
113-
)
113+
}
114114

115-
function get_metadata_attribute() (
116-
set +x
115+
function get_metadata_attribute() {
117116
local -r attribute_name="$1"
118117
local -r default_value="${2:-}"
119118
get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
120-
)
119+
}
121120

122121
OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
123122
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
@@ -216,13 +215,13 @@ function set_cuda_version() {
216215
readonly CUDA_FULL_VERSION
217216
}
218217

219-
function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
220-
function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
221-
function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
218+
# Succeed when the part of CUDA_VERSION before the first dot is exactly 12.
# Globals: CUDA_VERSION (read)
function is_cuda12() {
  case "${CUDA_VERSION%%.*}" in
    12) return 0 ;;
    *)  return 1 ;;
  esac
}
219+
# Succeed when the part of CUDA_VERSION before the first dot is less than
# or equal to 12, as decided by version_le.
# Globals: CUDA_VERSION (read)
function le_cuda12() {
  version_le "${CUDA_VERSION%%.*}" 12
}
220+
# Succeed when the part of CUDA_VERSION before the first dot is greater
# than or equal to 12, as decided by version_ge.
# Globals: CUDA_VERSION (read)
function ge_cuda12() {
  version_ge "${CUDA_VERSION%%.*}" 12
}
222221

223-
function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
224-
function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
225-
function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
222+
# Succeed when the part of CUDA_VERSION before the first dot is exactly 11.
# Globals: CUDA_VERSION (read)
function is_cuda11() {
  case "${CUDA_VERSION%%.*}" in
    11) return 0 ;;
    *)  return 1 ;;
  esac
}
223+
# Succeed when the part of CUDA_VERSION before the first dot is less than
# or equal to 11, as decided by version_le.
# Globals: CUDA_VERSION (read)
function le_cuda11() {
  version_le "${CUDA_VERSION%%.*}" 11
}
224+
# Succeed when the part of CUDA_VERSION before the first dot is greater
# than or equal to 11, as decided by version_ge.
# Globals: CUDA_VERSION (read)
function ge_cuda11() {
  version_ge "${CUDA_VERSION%%.*}" 11
}
226225

227226
function set_driver_version() {
228227
local gpu_driver_url
@@ -463,17 +462,14 @@ MIG_MAJOR_CAPS=0
463462
IS_MIG_ENABLED=0
464463

465464
function execute_with_retries() (
466-
set +x
467465
local -r cmd="$*"
468466

469467
if [[ "$cmd" =~ "^apt-get install" ]] ; then
470468
apt-get -y clean
471469
apt-get -o DPkg::Lock::Timeout=60 -y autoremove
472470
fi
473471
for ((i = 0; i < 3; i++)); do
474-
set -x
475472
time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
476-
set +x
477473
if [[ $retval == 0 ]] ; then return 0 ; fi
478474
sleep 5
479475
done
@@ -718,8 +714,8 @@ function install_nvidia_nccl() {
718714
mark_complete nccl
719715
}
720716

721-
function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
722-
function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
717+
# Succeed when GPU_DRIVER_PROVIDER is exactly the string NVIDIA.
# Globals: GPU_DRIVER_PROVIDER (read)
function is_src_nvidia() {
  case "${GPU_DRIVER_PROVIDER}" in
    NVIDIA) return 0 ;;
    *)      return 1 ;;
  esac
}
718+
# Succeed when GPU_DRIVER_PROVIDER is exactly the string OS.
# Globals: GPU_DRIVER_PROVIDER (read)
function is_src_os() {
  case "${GPU_DRIVER_PROVIDER}" in
    OS) return 0 ;;
    *)  return 1 ;;
  esac
}
723719

724720
function install_nvidia_cudnn() {
725721
is_complete cudnn && return
@@ -1576,13 +1572,21 @@ EOF
15761572
fi
15771573
local executor_cores
15781574
executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
1575+
[[ "${executor_cores}" == "0" ]] && executor_cores=1
15791576
local executor_memory
15801577
executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
15811578
local task_cpus=2
1579+
[[ "${task_cpus}" -gt "${executor_cores}" ]] && task_cpus="${executor_cores}"
15821580
local gpu_amount
15831581
# gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
15841582
gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
15851583

1584+
# the gpu.amount properties are not appropriate for the version of
1585+
# spark shipped with 1.5 images using the capacity scheduler. TODO:
1586+
# In order to get spark rapids GPU accelerated SQL working on 1.5
1587+
# images, we must configure the Fair scheduler
1588+
version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" || return
1589+
15861590
# TODO: when running this script to customize an image, this file
15871591
# needs to be written *after* bdutil completes
15881592

@@ -2044,7 +2048,7 @@ function exit_handler() {
20442048
if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi
20452049
fi
20462050

2047-
set +ex
2051+
set +e
20482052
echo "Exit handler invoked"
20492053

20502054
# Clear pip cache

gpu/test_gpu.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,6 @@ def verify_driver_signature(self, name):
180180
def test_install_gpu_without_agent(self, configuration, machine_suffixes,
181181
master_accelerator, worker_accelerator,
182182
driver_provider):
183-
self.skipTest("No need to regularly test not installing the agent")
184-
185183
metadata = "install-gpu-agent=false"
186184
if configuration == 'SINGLE' \
187185
and self.getImageOs() == 'rocky' \

0 commit comments

Comments
 (0)