-
Notifications
You must be signed in to change notification settings - Fork 69
Add fast-track to skip uninstall/install if NVIDIA driver modules present #454
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7c178f6
a8dbb15
ba7e6de
0a036ed
d4a6dff
b660caa
2087341
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,12 +8,14 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid | |
| DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} | ||
| KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver | ||
| NUM_VGPU_DEVICES=0 | ||
| DRIVER_TYPE="${DRIVER_TYPE:-passthrough}" | ||
| GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" | ||
| USE_HOST_MOFED="${USE_HOST_MOFED:-false}" | ||
| NVIDIA_MODULE_PARAMS=() | ||
| NVIDIA_UVM_MODULE_PARAMS=() | ||
| NVIDIA_MODESET_MODULE_PARAMS=() | ||
| NVIDIA_PEERMEM_MODULE_PARAMS=() | ||
| TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} | ||
| USE_HOST_MOFED="${USE_HOST_MOFED:-false}" | ||
| DNF_RELEASEVER=${DNF_RELEASEVER:-""} | ||
| RHEL_VERSION=${RHEL_VERSION:-""} | ||
| RHEL_MAJOR_VERSION=9 | ||
|
|
@@ -398,44 +400,7 @@ _load_driver() { | |
| set +o xtrace -o nounset | ||
| fi | ||
|
|
||
| echo "Starting NVIDIA persistence daemon..." | ||
| nvidia-persistenced --persistence-mode | ||
|
|
||
| if [ "${DRIVER_TYPE}" = "vgpu" ]; then | ||
| echo "Copying gridd.conf..." | ||
| cp /drivers/gridd.conf /etc/nvidia/gridd.conf | ||
| if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then | ||
| echo "Copying ClientConfigToken..." | ||
| mkdir -p /etc/nvidia/ClientConfigToken/ | ||
| cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ | ||
| fi | ||
|
|
||
| echo "Starting nvidia-gridd.." | ||
| LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd | ||
|
|
||
| # Start virtual topology daemon | ||
| _start_vgpu_topology_daemon | ||
| fi | ||
|
|
||
| if _assert_nvlink5_system; then | ||
| _ensure_nvlink5_prerequisites || return 1 | ||
| echo "Starting NVIDIA fabric manager daemon for NVLink5+..." | ||
|
|
||
| fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg | ||
| fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid | ||
| nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf | ||
| nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid | ||
| /usr/bin/nvidia-fabricmanager-start.sh --mode start \ | ||
| --fm-config-file $fm_config_file \ | ||
| --fm-pid-file $fm_pid_file \ | ||
| --nvlsm-config-file $nvlsm_config_file \ | ||
| --nvlsm-pid-file $nvlsm_pid_file | ||
|
|
||
| # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches | ||
| elif _assert_nvswitch_system; then | ||
| echo "Starting NVIDIA fabric manager daemon..." | ||
| nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg | ||
| fi | ||
| _start_daemons | ||
| } | ||
|
|
||
| # Stop persistenced and unload the kernel modules if they are currently loaded. | ||
|
|
@@ -477,6 +442,21 @@ _unload_driver() { | |
| fi | ||
| fi | ||
|
|
||
| if [ -f /var/run/nvidia-topologyd/nvidia-topologyd.pid ]; then | ||
| echo "Stopping NVIDIA topology daemon..." | ||
| local pid=$(< /var/run/nvidia-topologyd/nvidia-topologyd.pid) | ||
|
|
||
| kill -SIGTERM "${pid}" | ||
| for i in $(seq 1 50); do | ||
| kill -0 "${pid}" 2> /dev/null || break | ||
| sleep 0.1 | ||
| done | ||
| if [ $i -eq 50 ]; then | ||
| echo "Could not stop NVIDIA topology daemon" >&2 | ||
| return 1 | ||
| fi | ||
| fi | ||
|
|
||
| if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then | ||
| echo "Stopping NVIDIA fabric manager daemon..." | ||
| local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) | ||
|
|
@@ -566,11 +546,7 @@ _install_driver() { | |
| install_args+=("--skip-module-load") | ||
| fi | ||
|
|
||
| IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} | ||
| # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path | ||
| # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point | ||
| # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit | ||
| #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"} | ||
| IGNORE_CC_MISMATCH=1 nvidia-installer --silent --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} | ||
| } | ||
|
|
||
| # Mount the driver rootfs into the run directory with the exception of sysfs. | ||
|
|
@@ -701,6 +677,91 @@ _start_vgpu_topology_daemon() { | |
| nvidia-topologyd | ||
| } | ||
|
|
||
# Start the NVIDIA userspace daemons that accompany a loaded driver:
# the persistence daemon, the vGPU licensing/topology daemons (vGPU only),
# and the fabric manager when an NVLink/NVSwitch system is detected.
#
# Globals:  DRIVER_TYPE (read)
#           VGPU_LICENSE_SERVER_TYPE (read, only when DRIVER_TYPE is "vgpu")
# Returns:  1 when _ensure_nvlink5_prerequisites fails.
_start_daemons() {
    # Scratch paths for the NVLink5+ launcher; kept local so they do not
    # leak into the script's global namespace.
    local fm_config_file fm_pid_file nvlsm_config_file nvlsm_pid_file

    echo "Starting NVIDIA persistence daemon..."
    nvidia-persistenced --persistence-mode

    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
        echo "Copying gridd.conf..."
        cp /drivers/gridd.conf /etc/nvidia/gridd.conf
        if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then
            echo "Copying ClientConfigToken..."
            mkdir -p /etc/nvidia/ClientConfigToken/
            cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/
        fi

        echo "Starting nvidia-gridd.."
        LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd

        # Start virtual topology daemon
        _start_vgpu_topology_daemon
    fi

    if _assert_nvlink5_system; then
        _ensure_nvlink5_prerequisites || return 1
        echo "Starting NVIDIA fabric manager daemon for NVLink5+..."

        fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
        fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
        nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
        nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
        /usr/bin/nvidia-fabricmanager-start.sh --mode start \
            --fm-config-file "${fm_config_file}" \
            --fm-pid-file "${fm_pid_file}" \
            --nvlsm-config-file "${nvlsm_config_file}" \
            --nvlsm-pid-file "${nvlsm_pid_file}"

    # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
    elif _assert_nvswitch_system; then
        echo "Starting NVIDIA fabric manager daemon..."
        nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
    fi
}
|
|
||
# Persist the current driver configuration to ${RUN_DIR}/nvidia-driver.state.
#
# The output of _build_driver_config is staged in a sibling file and renamed
# into place only on success, so a failed build never leaves a truncated or
# partial state file behind and never reports success.
#
# Globals:  RUN_DIR (read)
# Returns:  0 on success, 1 if the configuration could not be built or stored.
_store_driver_config() {
    local config_file="${RUN_DIR}/nvidia-driver.state"
    # Same directory as the target so the final mv is a plain rename.
    local staging_file="${config_file}.tmp.$$"

    echo "Storing driver configuration state..."
    if ! _build_driver_config > "${staging_file}"; then
        rm -f -- "${staging_file}"
        echo "Failed to build driver configuration; ${config_file} left unchanged" >&2
        return 1
    fi
    if ! mv -f -- "${staging_file}" "${config_file}"; then
        rm -f -- "${staging_file}"
        echo "Failed to store driver configuration at ${config_file}" >&2
        return 1
    fi
    echo "Driver configuration stored at $config_file"
}
|
|
||
# Park the script until a termination signal arrives.
#
# A background `sleep infinity` is started and waited on. On HUP, INT, QUIT,
# PIPE or TERM the handler runs _shutdown and, if that succeeds, kills the
# sleeper and exits 0. The EXIT trap is cleared before waiting.
_wait_for_signal() {
    local on_signal

    echo "Done, now waiting for signal"
    sleep infinity &

    # Double quotes are deliberate: $! is expanded right here, so the handler
    # carries the PID of the sleeper that was just started.
    on_signal="echo 'Caught signal'; _shutdown && { kill $!; exit 0; }"
    trap "${on_signal}" HUP INT QUIT PIPE TERM
    trap - EXIT

    # A non-zero wait status just starts the next iteration.
    while :; do
        wait $! || true
    done
    # Not reached: the loop above has no exit other than the handler's `exit 0`.
    exit 0
}
|
|
||
# Fast path: install only the userspace driver components (no kernel module
# build or load), then remount the driver rootfs, start the daemons, write the
# kernel update hook and store the driver configuration.
#
# Globals:  DRIVER_ARCH, DRIVER_VERSION (read)
#           ACCEPT_LICENSE (read; treated as empty when unset)
#           KERNEL_TYPE (read after _resolve_kernel_type)
# Exits:    1 if the package directory cannot be entered or extracted, the
#           installer fails, or the kernel type cannot be resolved.
_userspace_only_install() {
    local pkg_dir="NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"
    local src_dir="/usr/src/nvidia-${DRIVER_VERSION}"
    # An array keeps each flag a separate word without relying on word splitting.
    local -a install_args=(
        --silent
        --no-kernel-module
        --no-nouveau-check
        --no-nvidia-modprobe
        --no-drm
        --no-peermem
        --ui=none
    )

    echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"
    _unmount_rootfs

    cd /drivers || exit 1
    # Extract the .run package only when it has not been unpacked already.
    if [ ! -d "${pkg_dir}" ]; then
        sh "${pkg_dir}.run" -x || exit 1
    fi
    cd "${pkg_dir}" || exit 1

    echo "Installing userspace components (libraries and binaries)..."
    if [ "${ACCEPT_LICENSE:-}" = "yes" ]; then
        install_args+=("--accept-license")
    fi
    IGNORE_CC_MISMATCH=1 ./nvidia-installer "${install_args[@]}" || exit 1

    # Copy kernel module sources if not already present (needed for sidecar containers)
    # NOTE(review): PR reviewers questioned whether this copy duplicates a step
    # already performed elsewhere in the install flow — confirm before relying on it.
    if [ ! -d "${src_dir}" ]; then
        _resolve_kernel_type || exit 1
        mkdir -p "${src_dir}"
        cp -r LICENSE mkprecompiled "${KERNEL_TYPE}" "${src_dir}/"
        sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > "${src_dir}/.manifest"
    fi

    _mount_rootfs
    _start_daemons
    _write_kernel_update_hook
    _store_driver_config
    echo "Userspace-only install complete"
}
|
|
||
| _prepare() { | ||
| if [ "${DRIVER_TYPE}" = "vgpu" ]; then | ||
| _find_vgpu_driver_version || exit 1 | ||
|
|
@@ -758,17 +819,48 @@ _load() { | |
| _load_driver | ||
| _mount_rootfs | ||
| _write_kernel_update_hook | ||
|
|
||
| echo "Done, now waiting for signal" | ||
| sleep infinity & | ||
| trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM | ||
| trap - EXIT | ||
| while true; do wait $! || continue; done | ||
| exit 0 | ||
| _store_driver_config | ||
| _wait_for_signal | ||
| } | ||
|
|
||
| init() { | ||
| _prepare_exclusive | ||
| if [ "${DRIVER_TYPE}" = "vgpu" ]; then | ||
| _find_vgpu_driver_version || exit 1 | ||
| fi | ||
|
|
||
| echo -e "\n========== NVIDIA Software Installer ==========\n" | ||
| echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" | ||
|
|
||
| exec 3> ${PID_FILE} | ||
| if ! flock -n 3; then | ||
| echo "An instance of the NVIDIA driver is already running, aborting" | ||
| exit 1 | ||
| fi | ||
| echo $$ >&3 | ||
|
|
||
| trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM | ||
| trap "_shutdown" EXIT | ||
|
Comment on lines
+827
to
+842
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To make this a bit easier to read, what if we reverted this change and continued to just call […]? Feel free to push back if this complicates things. |
||
|
|
||
| if _should_use_fast_path; then | ||
| _userspace_only_install | ||
| _wait_for_signal | ||
| fi | ||
|
|
||
| _unload_driver || exit 1 | ||
| _unmount_rootfs | ||
|
|
||
| # Install the userspace components and copy the kernel module sources. | ||
| sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ | ||
| cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ | ||
| sh /tmp/install.sh nvinstall | ||
|
|
||
| # Determine the kernel module type | ||
| _resolve_kernel_type || exit 1 | ||
|
|
||
| # Copy the kernel module sources | ||
| mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ | ||
| mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ | ||
| sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest | ||
|
|
||
| _build | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The permissions of this file have been changed.