diff --git a/.common-ci.yml b/.common-ci.yml index 48ee7129b..8a2a2e162 100644 --- a/.common-ci.yml +++ b/.common-ci.yml @@ -122,6 +122,10 @@ trigger-pipeline: variables: DIST: rhel9 +.dist-rhel10: + variables: + DIST: rhel10 + .dist-fedora36: variables: DIST: fedora36 @@ -207,6 +211,14 @@ trigger-pipeline: rules: - if: $CI_PIPELINE_SOURCE != "schedule" && $CI_COMMIT_TAG == null +.release-rhel10: + # Perform for each DRIVER_VERSION + extends: + - .release-generic + - .driver-versions + rules: + - if: $CI_PIPELINE_SOURCE != "schedule" && $CI_COMMIT_TAG == null + .release: # Perform for each DRIVER_VERSION extends: @@ -254,6 +266,15 @@ trigger-pipeline: OUT_REGISTRY: "${NGC_REGISTRY}" OUT_IMAGE_NAME: "${NGC_STAGING_REGISTRY}/driver" +.release:staging-rhel10: + extends: + - .release-rhel10 + variables: + OUT_REGISTRY_USER: "${NGC_REGISTRY_USER}" + OUT_REGISTRY_TOKEN: "${NGC_REGISTRY_TOKEN}" + OUT_REGISTRY: "${NGC_REGISTRY}" + OUT_IMAGE_NAME: "${NGC_STAGING_REGISTRY}/driver" + # Define an external release step that pushes an image to an external repository. .release:external: extends: @@ -322,6 +343,13 @@ release:staging-rhel9: needs: - image-rhel9 +release:staging-rhel10: + extends: + - .release:staging + - .dist-rhel10 + needs: + - image-rhel10 + .release:staging-precompiled: stage: release variables: diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index 4d1c3a3e5..f98ade7a4 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -36,11 +36,16 @@ jobs: - ubuntu24.04 - rhel8 - rhel9 + - rhel10 ispr: - ${{github.event_name == 'pull_request'}} exclude: - dist: ubuntu24.04 driver: 535.274.02 + - dist: rhel10 + driver: 535.274.02 + - dist: rhel10 + driver: 570.195.03 fail-fast: false steps: - uses: actions/checkout@v5 @@ -128,6 +133,10 @@ jobs: dist: ubuntu24.04 - flavor: azure-fde dist: ubuntu22.04 + - dist: rhel10 + driver: 535 + - dist: rhel10 + driver: 570 steps: - uses: actions/checkout@v5 name: Check out code diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8782e3439..d65ffacf8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -67,6 +67,14 @@ include: rules: - if: $CI_PIPELINE_SOURCE != "schedule" +# Define the image build targets +.image-build-rhel10: + # Perform for each DRIVER_VERSION + extends: + - .driver-versions + - .image-build-generic + rules: + - if: $CI_PIPELINE_SOURCE != "schedule" image-ubuntu20.04: extends: @@ -93,6 +101,11 @@ image-rhel9: - .image-build-rhel9 - .dist-rhel9 +image-rhel10: + extends: + - .image-build-rhel10 + - .dist-rhel10 + image-fedora36: extends: - .image-build diff --git a/.nvidia-ci.yml b/.nvidia-ci.yml index fdaf8d0b8..a21d6cd41 100644 --- a/.nvidia-ci.yml +++ b/.nvidia-ci.yml @@ -146,6 +146,11 @@ image-rhel9: - .image-pull - .dist-rhel9 +image-rhel10: + extends: + - .image-pull + - .dist-rhel10 + # The .scan step forms the base of the image scan operation performed before releasing # images. 
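# Each concrete scan job composes this base with a .dist-* and a .platform-* mixin;
# the scan-rhel10-amd64 and scan-rhel10-arm64 jobs below follow the existing rhel9 pattern.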
.scan-generic: @@ -333,6 +338,22 @@ scan-rhel9-arm64: needs: - image-rhel9 +scan-rhel10-amd64: + extends: + - .scan + - .dist-rhel10 + - .platform-amd64 + needs: + - image-rhel10 + +scan-rhel10-arm64: + extends: + - .scan + - .dist-rhel10 + - .platform-arm64 + needs: + - image-rhel10 + .release:ngc-variables: variables: OUT_REGISTRY_USER: "${NGC_REGISTRY_USER}" @@ -438,6 +459,20 @@ release:ngc-rhel9.6: variables: OUT_DIST: "rhel9.6" +release:ngc-rhel10.0: + extends: + - .release:ngc + - .dist-rhel10 + variables: + OUT_DIST: "rhel10.0" + +release:ngc-rhel10.1: + extends: + - .release:ngc + - .dist-rhel10 + variables: + OUT_DIST: "rhel10.1" + generate-build-info: stage: ngc-publish artifacts: diff --git a/Makefile b/Makefile index ee633db42..d1a45b015 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,7 @@ OUT_IMAGE_TAG = $(OUT_IMAGE_VERSION)-$(OUT_DIST) OUT_IMAGE = $(OUT_IMAGE_NAME):$(OUT_IMAGE_TAG) ##### Public rules ##### -DISTRIBUTIONS := ubuntu18.04 ubuntu20.04 ubuntu22.04 ubuntu24.04 signed_ubuntu20.04 signed_ubuntu22.04 signed_ubuntu24.04 rhel8 rhel9 flatcar fedora36 sles15.3 precompiled_rhcos +DISTRIBUTIONS := ubuntu18.04 ubuntu20.04 ubuntu22.04 ubuntu24.04 signed_ubuntu20.04 signed_ubuntu22.04 signed_ubuntu24.04 rhel8 rhel9 rhel10 flatcar fedora36 sles15.3 precompiled_rhcos RHCOS_VERSIONS := rhcos4.14 rhcos4.15 rhcos4.16 rhcos4.17 rhcos4.18 rhel9.6 PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) BASE_FROM := noble jammy focal diff --git a/ci/localbuild.sh b/ci/localbuild.sh index b176360b0..d77b7b55c 100755 --- a/ci/localbuild.sh +++ b/ci/localbuild.sh @@ -59,6 +59,7 @@ driver_container_build_rhel() driver_container_build_simple "rhel7" driver_container_build_simple "rhel8" driver_container_build_simple "rhel9" + driver_container_build_simple "rhel10" } list_all_containers() diff --git a/ci/run.sh b/ci/run.sh index 0728e8fb8..96243f693 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -167,6 +167,7 @@ done build "rhel7" "${CONTAINER_VERSION}-rhel7" "$(mk_short_version rhel7)" "" build "rhel8" "${CONTAINER_VERSION}-rhel8" "$(mk_short_version rhel8)" "" build "rhel9" "${CONTAINER_VERSION}-rhel9" "$(mk_short_version rhel9)" "" +build "rhel10" "${CONTAINER_VERSION}-rhel10" "$(mk_short_version rhel10)" "" # Add rhcos tags docker pull "${REGISTRY}:${CONTAINER_VERSION}-rhel8" diff --git a/rhel10/Dockerfile b/rhel10/Dockerfile new file mode 100644 index 000000000..9aa0dc0de --- /dev/null +++ b/rhel10/Dockerfile @@ -0,0 +1,114 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/cuda:13.0.1-base-ubi10 + +FROM ${BASE_IMAGE} as build + +ARG TARGETARCH +ARG GOLANG_VERSION + +# Arg to indicate if driver type is either of passthrough(baremetal) or vgpu +ARG DRIVER_TYPE=passthrough +ENV DRIVER_TYPE=$DRIVER_TYPE + +SHELL ["/bin/bash", "-c"] + +RUN dnf install -y git wget + +# download appropriate binary based on the target architecture for multi-arch builds +RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \ + wget -nv -O - https://go.dev/dl/go${GOLANG_VERSION}.linux-${OS_ARCH}.tar.gz \ + | tar -C /usr/local -xz + +ENV PATH /usr/local/go/bin:$PATH + +WORKDIR /work + +RUN if [ "$DRIVER_TYPE" = "vgpu" ]; then \ + git clone https://github.com/NVIDIA/gpu-driver-container driver && \ + cd driver/vgpu/src && \ + go build -o vgpu-util && \ + mv vgpu-util /work; fi + +FROM ${BASE_IMAGE} + +ARG TARGETARCH +ENV TARGETARCH=$TARGETARCH + +SHELL ["/bin/bash", "-c"] + +#ARG BASE_URL=http://us.download.nvidia.com/XFree86/Linux-x86_64 +ARG BASE_URL=https://us.download.nvidia.com/tesla +ARG DRIVER_VERSION 
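+# DRIVER_VERSION and DRIVER_BRANCH are supplied by the CI build matrix. A minimal
+# local build sketch (hypothetical placeholder values, assuming the rhel10 context):
+#   docker build -t driver:<driver-version>-rhel10 \
+#     --build-arg DRIVER_VERSION=<driver-version> \
+#     --build-arg DRIVER_BRANCH=<driver-branch> rhel10/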
+ENV DRIVER_VERSION=$DRIVER_VERSION
+ARG DRIVER_BRANCH
+ENV DRIVER_BRANCH=$DRIVER_BRANCH
+
+# Arg to indicate whether the driver type is passthrough (baremetal) or vgpu
+ARG DRIVER_TYPE=passthrough
+ENV DRIVER_TYPE=$DRIVER_TYPE
+ARG VGPU_LICENSE_SERVER_TYPE=NLS
+ENV VGPU_LICENSE_SERVER_TYPE=$VGPU_LICENSE_SERVER_TYPE
+# Disable the vGPU version compatibility check by default
+ARG DISABLE_VGPU_VERSION_CHECK=true
+ENV DISABLE_VGPU_VERSION_CHECK=$DISABLE_VGPU_VERSION_CHECK
+# Avoid a dependency on the container-toolkit for the driver container
+ENV NVIDIA_VISIBLE_DEVICES=void
+
+ADD install.sh /tmp/
+
+RUN NVIDIA_GPGKEY_SUM=afbea87d3b979b3788ef34223aeeb323ade481128e2c133723ae99b8a51368bb && \
+    OS_ARCH=${TARGETARCH/amd64/x86_64} && OS_ARCH=${OS_ARCH/arm64/sbsa} && \
+    curl -fsSL "https://developer.download.nvidia.com/compute/cuda/repos/rhel10/$OS_ARCH/CDF6BA43.pub" | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
+    echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict -
+
+RUN sh /tmp/install.sh depinstall && \
+    sh /tmp/install.sh setup_cuda_repo && \
+    curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \
+    curl -fsSL -o /usr/local/bin/extract-vmlinux https://raw.githubusercontent.com/torvalds/linux/master/scripts/extract-vmlinux && \
+    chmod +x /usr/local/bin/donkey /usr/local/bin/extract-vmlinux && \
+    ln -s /sbin/ldconfig /sbin/ldconfig.real
+
+ADD drivers drivers/
+
+# Fetch the driver installer automatically for passthrough/baremetal types
+RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \
+    cd drivers && \
+    DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} && \
+    curl -fsSL -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \
+    chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run; fi
+
+# Fetch the fabricmanager, libnvidia-nscq, libnvsdm and imex packages
+RUN sh /tmp/install.sh extrapkgsinstall
+
+COPY nvidia-driver /usr/local/bin
+COPY ocp_dtk_entrypoint /usr/local/bin
+COPY common.sh /usr/local/bin
+
+COPY --from=build /work/vgpu-util* /usr/local/bin
+
+WORKDIR /drivers
+
+ARG PUBLIC_KEY=empty
+COPY ${PUBLIC_KEY} kernel/pubkey.x509
+
+ARG PRIVATE_KEY
+ARG KERNEL_VERSION=latest
+
+LABEL io.k8s.display-name="NVIDIA Driver Container"
+LABEL name="NVIDIA Driver Container"
+LABEL vendor="NVIDIA"
+LABEL version="${DRIVER_VERSION}"
+LABEL release="N/A"
+LABEL summary="Provision the NVIDIA driver through containers"
+LABEL description="See summary"
+
+# Install / upgrade packages here that are required to resolve CVEs
+ARG CVE_UPDATES
+RUN if [ -n "${CVE_UPDATES}" ]; then \
+        yum update -y ${CVE_UPDATES} && \
+        rm -rf /var/cache/yum/*; \
+    fi
+
+# Remove the cuda repository to avoid GPG errors
+RUN rm -f /etc/yum.repos.d/cuda.repo
+
+ENTRYPOINT ["nvidia-driver", "init"]
diff --git a/rhel10/LICENSE b/rhel10/LICENSE
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/rhel10/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/rhel10/README.md b/rhel10/README.md new file mode 100644 index 000000000..bf7ce948c --- /dev/null +++ b/rhel10/README.md @@ -0,0 +1,3 @@ +# RHEL10 [![build status](https://gitlab.com/nvidia/driver/badges/master/build.svg)](https://gitlab.com/nvidia/driver/commits/master) + +See https://github.com/NVIDIA/nvidia-docker/wiki/Driver-containers-(Beta) diff --git a/rhel10/common.sh b/rhel10/common.sh new file mode 100755 index 000000000..a41a14a12 --- /dev/null +++ b/rhel10/common.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. + +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" +GDS_ENABLED="${GDS_ENABLED:-false}" +GDRCOPY_ENABLED="${GDRCOPY_ENABLED:-false}" + +# Check if mellanox devices are present +_mellanox_devices_present() { + devices_found=0 + for dev in /sys/bus/pci/devices/*; do + read vendor < $dev/vendor + if [ "$vendor" = "0x15b3" ]; then + echo "Mellanox device found at $(basename $dev)" + return 0 + fi + done + echo "No Mellanox devices were found..." + return 1 +} + +# Check if GPU Direct RDMA is enabled +_gpu_direct_rdma_enabled() { + if [ "${GPU_DIRECT_RDMA_ENABLED}" = "true" ]; then + # check if mellanox cards are present + if _mellanox_devices_present; then + return 0 + fi + fi + return 1 +} + +# Check if GDS is enabled +_gpu_direct_storage_enabled() { + if [ "${GDS_ENABLED}" = "true" ]; then + return 0 + fi + return 1 +} + +# Check if GDRCopy is enabled +_gdrcopy_enabled() { + if [ "${GDRCOPY_ENABLED}" = "true" ]; then + return 0 + fi + return 1 +} diff --git a/rhel10/drivers/README.md b/rhel10/drivers/README.md new file mode 100644 index 000000000..ddc27b5c9 --- /dev/null +++ b/rhel10/drivers/README.md @@ -0,0 +1 @@ +# Folder for downloading vGPU drivers and dependent metadata files \ No newline at end of file diff --git a/rhel10/empty b/rhel10/empty new file mode 100644 index 000000000..e69de29bb diff --git a/rhel10/install.sh b/rhel10/install.sh new file mode 100755 index 000000000..4ea0b618d --- /dev/null +++ b/rhel10/install.sh @@ -0,0 +1,158 @@ +#!/bin/bash +# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. 
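+#
+# This script is invoked with a single sub-command argument. The Dockerfile runs
+# "sh /tmp/install.sh depinstall" and "sh /tmp/install.sh setup_cuda_repo" at image
+# build time and "sh /tmp/install.sh extrapkgsinstall" after fetching the installer,
+# while nvidia-driver runs "sh /tmp/install.sh nvinstall" after unpacking the .run
+# package. Hypothetical manual invocation inside the image:
+#
+#   TARGETARCH=amd64 sh /tmp/install.sh depinstall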
+
+set -eu
+
+DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
+echo "DRIVER_ARCH is $DRIVER_ARCH"
+
+dep_installer () {
+    # The dependency set is identical on x86_64, ppc64le and aarch64
+    dnf install -y \
+        libglvnd-glx \
+        ca-certificates \
+        curl-minimal \
+        gcc \
+        glibc \
+        make \
+        cpio \
+        kmod
+
+    rm -rf /var/cache/yum/*
+}
+
+nvidia_installer () {
+    # The 32-bit compatibility libraries only exist for x86_64
+    local compat32_flag=""
+    case "$DRIVER_ARCH" in
+        x86_64) compat32_flag="--install-compat32-libs" ;;
+        ppc64le|aarch64) ;;
+        *) echo "DRIVER_ARCH doesn't match a known arch target"; return 0 ;;
+    esac
+    ./nvidia-installer --silent \
+                       --no-kernel-module \
+                       ${compat32_flag} \
+                       --no-nouveau-check \
+                       --no-nvidia-modprobe \
+                       --no-rpms \
+                       --no-backup \
+                       --no-check-for-alternate-installs \
+                       --no-libglx-indirect \
+                       --no-install-libglvnd \
+                       --x-prefix=/tmp/null \
+                       --x-module-path=/tmp/null \
+                       --x-library-path=/tmp/null \
+                       --x-sysconfig-path=/tmp/null
+}
+
+fabricmanager_install() {
+    if [ "$DRIVER_BRANCH" -ge "590" ]; then
+        dnf install -y nvidia-fabricmanager-${DRIVER_VERSION}
+    fi
+}
+
+nscq_install() {
+    if [ "$DRIVER_BRANCH" -ge "590" ]; then
+        dnf install -y libnvidia-nscq-${DRIVER_VERSION}
+    fi
+}
+
+# libnvsdm packages are not available for arm64
+nvsdm_install() {
+    if [ "$TARGETARCH" = "amd64" ]; then
+        if [ "$DRIVER_BRANCH" -ge "590" ]; then
+            dnf install -y libnvsdm-${DRIVER_VERSION}
+            return 0
+        fi
+    fi
+}
+
+nvlink5_pkgs_install() {
+    if [ "$DRIVER_BRANCH" -ge "550" ]; then
+        dnf install -y infiniband-diags nvlsm
+    fi
+}
+
+imex_install() {
+    if [ "$DRIVER_BRANCH" -ge "590" ]; then
+        dnf install -y nvidia-imex-${DRIVER_VERSION}
+    fi
+}
+
+extra_pkgs_install() {
+    if [ "$DRIVER_TYPE" != "vgpu" ]; then
+        fabricmanager_install
+        nscq_install
+        nvsdm_install
+        nvlink5_pkgs_install
+        imex_install
+    fi
+}
+
+setup_cuda_repo() {
+    OS_ARCH=${TARGETARCH/amd64/x86_64} && OS_ARCH=${OS_ARCH/arm64/sbsa};
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel10/${OS_ARCH}/cuda-rhel10.repo
+}
+
+if [ "$1" = "nvinstall" ]; then
+    nvidia_installer
+elif [ "$1" = "depinstall" ]; then
+    dep_installer
+elif [ "$1" = "extrapkgsinstall" ]; then
+    extra_pkgs_install
+elif [ "$1" = "setup_cuda_repo" ]; then
+    setup_cuda_repo
+else
+    echo "Unknown function: $1"
+    exit 1
+fi
diff --git a/rhel10/nvidia-driver b/rhel10/nvidia-driver
new file mode 100755
index 000000000..77e82611f
--- /dev/null
+++ 
b/rhel10/nvidia-driver @@ -0,0 +1,927 @@ +#! /bin/bash -x +# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + +set -eu + +RUN_DIR=/run/nvidia +PID_FILE=${RUN_DIR}/${0##*/}.pid +DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} +KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver +NUM_VGPU_DEVICES=0 +NVIDIA_MODULE_PARAMS=() +NVIDIA_UVM_MODULE_PARAMS=() +NVIDIA_MODESET_MODULE_PARAMS=() +NVIDIA_PEERMEM_MODULE_PARAMS=() +TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" +DNF_RELEASEVER=${DNF_RELEASEVER:-""} +RHEL_VERSION=${RHEL_VERSION:-""} +RHEL_MAJOR_VERSION=10 +RHEL_MINOR_VERSION=${RHEL_MINOR_VERSION:-""} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} + +DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} +echo "DRIVER_ARCH is $DRIVER_ARCH" + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source $SCRIPT_DIR/common.sh + +_update_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Updating the package cache..." + if ! yum -q makecache; then + echo "FATAL: failed to reach RHEL package repositories. "\ + "Ensure that the cluster can access the proper networks." + exit 1 + fi + fi +} + +_cleanup_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Cleaning up the package cache..." + rm -rf /var/cache/yum/* + fi +} + +_get_rhel_version_from_kernel() { + local rhel_version_underscore rhel_version_arr + rhel_version_underscore=$(echo "${KERNEL_VERSION}" | sed 's/.*el\([0-9]\+_[0-9]\+\).*/\1/g') + # For e.g. :- from the kernel version 4.18.0-513.9.1.el8_9, we expect to extract the string "8_9" + if [[ ! ${rhel_version_underscore} =~ ^[0-9]+_[0-9]+$ ]]; then + echo "Unable to resolve RHEL version from kernel version" >&2 + return 1 + fi + IFS='_' read -r -a rhel_version_arr <<< "$rhel_version_underscore" + if [[ ${#rhel_version_arr[@]} -ne 2 ]]; then + echo "Unable to resolve RHEL version from kernel version" >&2 + return 1 + fi + RHEL_VERSION="${rhel_version_arr[0]}.${rhel_version_arr[1]}" + RHEL_MINOR_VERSION=${rhel_version_arr[1]} + echo "RHEL VERSION successfully resolved from kernel: ${RHEL_VERSION}" + return 0 +} + +_resolve_rhel_version() { + _get_rhel_version_from_kernel || RHEL_VERSION="${RHEL_MAJOR_VERSION}" + # set dnf release version as rhel version by default + if [[ -z "${DNF_RELEASEVER}" ]]; then + DNF_RELEASEVER="${RHEL_VERSION}" + fi + return 0 +} + +# Resolve the kernel version to the form major.minor.patch-revision. +_resolve_kernel_version() { + echo "Resolving Linux kernel version..." + local version=$(yum -q list available --showduplicates kernel-headers | + awk -v arch=$(uname -m) 'NR>1 {print $2"."arch}' | tac | grep -E -m1 "^${KERNEL_VERSION/latest/.*}") + + if [ -z "${version}" ]; then + echo "Could not resolve Linux kernel version" >&2 + return 1 + fi + KERNEL_VERSION="${version}" + echo "Proceeding with Linux kernel version ${KERNEL_VERSION}" + return 0 +} + +# Install the kernel modules header/builtin/order files and generate the kernel version string. +_install_prerequisites() ( + local tmp_dir=$(mktemp -d) + + trap "rm -rf ${tmp_dir}" EXIT + cd ${tmp_dir} + + rm -rf /lib/modules/${KERNEL_VERSION} + mkdir -p /lib/modules/${KERNEL_VERSION}/proc + + echo "Enabling RHOCP and EUS RPM repos..." + if [ -n "${OPENSHIFT_VERSION:-}" ]; then + dnf config-manager --set-enabled rhocp-${OPENSHIFT_VERSION}-for-rhel-10-$DRIVER_ARCH-rpms || true + if ! 
dnf makecache --releasever=${DNF_RELEASEVER}; then
+            dnf config-manager --set-disabled rhocp-${OPENSHIFT_VERSION}-for-rhel-10-$DRIVER_ARCH-rpms || true
+        fi
+    fi
+
+    dnf config-manager --set-enabled rhel-10-for-$DRIVER_ARCH-baseos-eus-rpms || true
+    if ! dnf makecache --releasever=${DNF_RELEASEVER}; then
+        dnf config-manager --set-disabled rhel-10-for-$DRIVER_ARCH-baseos-eus-rpms || true
+    fi
+
+    # Try with EUS disabled; if that does not work, fall back to just the major version
+    if ! dnf makecache --releasever=${DNF_RELEASEVER}; then
+        # If pointing to DNF_RELEASEVER does not work, point to RHEL_MAJOR_VERSION as a last resort
+        if ! dnf makecache --releasever=${RHEL_MAJOR_VERSION}; then
+            echo "FATAL: failed to update the dnf metadata cache after multiple attempts with releasevers ${DNF_RELEASEVER}, ${RHEL_MAJOR_VERSION}"
+            exit 1
+        else
+            DNF_RELEASEVER=${RHEL_MAJOR_VERSION}
+        fi
+    fi
+
+    echo "Installing elfutils..."
+    dnf install -q -y --releasever=${DNF_RELEASEVER} elfutils-libelf.$DRIVER_ARCH
+    if ! dnf install -y --releasever=${DNF_RELEASEVER} elfutils-libelf-devel.$DRIVER_ARCH; then
+        echo "FATAL: failed to install elfutils-libelf-devel. RHEL entitlement may be improperly deployed."
+        exit 1
+    fi
+
+    echo "Installing Linux kernel headers..."
+    dnf -q -y --releasever=${DNF_RELEASEVER} install kernel-headers-${KERNEL_VERSION} kernel-devel-${KERNEL_VERSION} > /dev/null
+    ln -s /usr/src/kernels/${KERNEL_VERSION} /lib/modules/${KERNEL_VERSION}/build
+
+    echo "Installing Linux kernel module files..."
+    dnf -q -y --releasever=${DNF_RELEASEVER} install kernel-${KERNEL_VERSION} > /dev/null
+
+    # Prevent depmod from giving a WARNING about missing files
+    touch /lib/modules/${KERNEL_VERSION}/modules.order
+    touch /lib/modules/${KERNEL_VERSION}/modules.builtin
+
+    depmod ${KERNEL_VERSION}
+
+    echo "Generating Linux kernel version string..."
+    extract-vmlinux /lib/modules/${KERNEL_VERSION}/vmlinuz | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version
+
+    if [ -z "$(<version)" ]; then
+        # Fall back to gunzip for compressed kernel images (e.g. on aarch64) where extract-vmlinux fails
+        gunzip -c /lib/modules/${KERNEL_VERSION}/vmlinuz | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version
+    fi
+
+    if [ -z "$(<version)" ]; then
+        echo "Could not locate Linux kernel version string" >&2
+        return 1
+    fi
+    mv version /lib/modules/${KERNEL_VERSION}/proc
+
+    # Parse the gcc version
+    # gcc_version is expected to match x.y.z
+    # current_gcc is expected to match 'gcc-x.y.z-rel.el10.x86_64'
+    local gcc_version=$(cat /lib/modules/${KERNEL_VERSION}/proc/version | grep -Eo "gcc \(GCC\) ([0-9\.]+)" | grep -Eo "([0-9\.]+)")
+    local current_gcc=$(rpm -qa gcc)
+    echo "kernel requires gcc version: 'gcc-${gcc_version}', current gcc version is '${current_gcc}'"
+
+    if ! [[ "${current_gcc}" =~ "gcc-${gcc_version}"-.* ]]; then
+        dnf install -q -y --releasever=${DNF_RELEASEVER} "gcc-${gcc_version}"
+    fi
+)
+
+# Cleanup the prerequisites installed above.
+_remove_prerequisites() {
+    true
+    if [ "${PACKAGE_TAG:-}" != "builtin" ]; then
+        dnf -q -y remove kernel-headers-${KERNEL_VERSION} kernel-devel-${KERNEL_VERSION} > /dev/null
+        # TODO remove module files not matching an existing driver package.
+    fi
+}
+
+# Check if the kernel version requires a new precompiled driver package.
+_kernel_requires_package() {
+    local proc_mount_arg=""
+
+    echo "Checking NVIDIA driver packages..."
+
+    [[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} ]] && return 0
+    cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE}
+
+    proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc"
+    for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do
+        is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg})
+        if [ "${is_match}" == "kernel interface matches."
]; then + echo "Found NVIDIA driver package ${pkg_name##*/}" + return 1 + fi + done + return 0 +} + +# Compile the kernel modules, optionally sign them, and generate a precompiled package for use by the nvidia-installer. +_create_driver_package() ( + local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}" + local nvidia_sign_args="" + local nvidia_modeset_sign_args="" + local nvidia_uvm_sign_args="" + + trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT + + echo "Compiling NVIDIA driver kernel modules..." + cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} + + if _gpu_direct_rdma_enabled; then + ln -s /run/mellanox/drivers/usr/src/ofa_kernel /usr/src/ + # if arch directory exists(MOFED >=5.5) then create a symlink as expected by GPU driver installer + # This is required as currently GPU driver installer doesn't expect headers in x86_64 folder, but only in either default or kernel-version folder. + # ls -ltr /usr/src/ofa_kernel/ + # lrwxrwxrwx 1 root root 36 Dec 8 20:10 default -> /etc/alternatives/ofa_kernel_headers + # drwxr-xr-x 4 root root 4096 Dec 8 20:14 x86_64 + # lrwxrwxrwx 1 root root 44 Dec 9 19:05 5.4.0-90-generic -> /usr/src/ofa_kernel/x86_64/5.4.0-90-generic/ + if [[ -d "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" ]]; then + if [[ ! -e "/usr/src/ofa_kernel/$(uname -r)" ]]; then + ln -s "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" /usr/src/ofa_kernel/ + fi + fi + fi + + make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null + + echo "Relinking NVIDIA driver kernel modules..." + rm -f nvidia.ko nvidia-modeset.ko + ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary + ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary + + if [ -n "${PRIVATE_KEY}" ]; then + echo "Signing NVIDIA driver kernel modules..." + donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/linux-headers-${KERNEL_VERSION}/scripts && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko" + nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign" + nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign" + nvidia_uvm_sign_args="--signed" + fi + + echo "Building NVIDIA driver package ${pkg_name}..." + ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \ + --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc \ + --driver-version ${DRIVER_VERSION} \ + --kernel-interface nv-linux.o \ + --linked-module-name nvidia.ko \ + --core-object-name nvidia/nv-kernel.o_binary \ + ${nvidia_sign_args} \ + --target-directory . \ + --kernel-interface nv-modeset-linux.o \ + --linked-module-name nvidia-modeset.ko \ + --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \ + ${nvidia_modeset_sign_args} \ + --target-directory . \ + --kernel-module nvidia-uvm.ko \ + ${nvidia_uvm_sign_args} \ + --target-directory . 
+ mkdir -p precompiled + mv ${pkg_name} precompiled +) + +_assert_nvswitch_system() { + [ -d /proc/driver/nvidia-nvswitch ] || return 1 + entries=$(ls -1 /proc/driver/nvidia-nvswitch/devices/*) + if [ -z "${entries}" ]; then + return 1 + fi + return 0 +} + +_assert_nvlink5_system() ( + for dir in /sys/class/infiniband/*/device; do + # Define the path to the VPD file + vpd_file="$dir/vpd" + + # Check if the VPD file exists + if [ -f "$vpd_file" ]; then + # Search for 'SW_MNG' in the VPD file + if grep -q "SW_MNG" "$vpd_file"; then + echo "Detected NVLink5+ system" + return 0 + fi + fi + done + return 1 +) + +_ensure_nvlink5_prerequisites() ( + until lsmod | grep mlx5_core > /dev/null 2>&1 && lsmod | grep ib_umad > /dev/null 2>&1; + do + echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded" + sleep 10 + done +) + +# For each kernel module configuration file mounted into the container, +# parse the file contents and extract the custom module parameters that +# are to be passed as input to 'modprobe'. +# +# Assumptions: +# - Configuration files are named .conf (i.e. nvidia.conf, nvidia-uvm.conf). +# - Configuration files are mounted inside the container at /drivers. +# - Each line in the file contains at least one parameter, where parameters on the same line +# are space delimited. It is up to the user to properly format the file to ensure +# the correct set of parameters are passed to 'modprobe'. +_get_module_params() { + local base_path="/drivers" + + # Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter. + # This prevents the GPU memory for coherent systems (GH200, GB200 etc) from being exposed as a NUMA node + # and thereby preventing over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use-cases + NVIDIA_MODULE_PARAMS+=("NVreg_CoherentGPUMemoryMode=driver") + + # nvidia + if [ -f "${base_path}/nvidia.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia.conf" + echo "Module parameters provided for nvidia: ${NVIDIA_MODULE_PARAMS[@]}" + fi + # nvidia-uvm + if [ -f "${base_path}/nvidia-uvm.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_UVM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-uvm.conf" + echo "Module parameters provided for nvidia-uvm: ${NVIDIA_UVM_MODULE_PARAMS[@]}" + fi + # nvidia-modeset + if [ -f "${base_path}/nvidia-modeset.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODESET_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-modeset.conf" + echo "Module parameters provided for nvidia-modeset: ${NVIDIA_MODESET_MODULE_PARAMS[@]}" + fi + # nvidia-peermem + if [ -f "${base_path}/nvidia-peermem.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_PEERMEM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-peermem.conf" + echo "Module parameters provided for nvidia-peermem: ${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + fi +} + +# Load the kernel modules and start persistenced. +_load_driver() { + echo "Parsing kernel module parameters..." 
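+    # Example (hypothetical): a /drivers/nvidia.conf mounted into the container with the
+    # single line "NVreg_EnableGpuFirmware=0" results in the modprobe call below being
+    #   modprobe nvidia NVreg_CoherentGPUMemoryMode=driver NVreg_EnableGpuFirmware=0
+    # since _get_module_params always prepends the CDMM parameter.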
+ _get_module_params + + local nv_fw_search_path="$RUN_DIR/driver/lib/firmware" + local set_fw_path="true" + local fw_path_config_file="/sys/module/firmware_class/parameters/path" + for param in "${NVIDIA_MODULE_PARAMS[@]}"; do + if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then + set_fw_path="false" + fi + done + + if [[ "$set_fw_path" == "true" ]]; then + echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path" + if [[ ! -z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then + echo "WARNING: A search path is already configured in $fw_path_config_file" + echo " Retaining the current configuration" + else + echo -n "$nv_fw_search_path" > $fw_path_config_file || echo "WARNING: Failed to configure the firmware search path" + fi + fi + + echo "Loading ipmi and i2c_core kernel modules..." + modprobe -a i2c_core ipmi_msghandler ipmi_devintf + + if [[ "$RHEL_MINOR_VERSION" -ge "3" ]]; then + echo "Loading the video kernel module..." + modprobe video + fi + + echo "Loading NVIDIA driver kernel modules..." + set -o xtrace +o nounset + modprobe nvidia "${NVIDIA_MODULE_PARAMS[@]}" + modprobe nvidia-uvm "${NVIDIA_UVM_MODULE_PARAMS[@]}" + modprobe nvidia-modeset "${NVIDIA_MODESET_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + + if _gpu_direct_rdma_enabled; then + echo "Loading NVIDIA Peer Memory kernel module..." + set -o xtrace +o nounset + modprobe -a nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + fi + + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon + fi + + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi +} + +# Stop persistenced and unload the kernel modules if they are currently loaded. +_unload_driver() { + local rmmod_args=() + local nvidia_deps=0 + local nvidia_refs=0 + local nvidia_uvm_refs=0 + local nvidia_modeset_refs=0 + local nvidia_peermem_refs=0 + + if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then + echo "Stopping NVIDIA persistence daemon..." 
+ local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA persistence daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-gridd/nvidia-gridd.pid ]; then + echo "Stopping NVIDIA grid daemon..." + local pid=$(< /var/run/nvidia-gridd/nvidia-gridd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA Grid daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then + echo "Stopping NVIDIA fabric manager daemon..." + local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA fabric manager daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then + echo "Stopping NVLink Subnet Manager daemon..." + local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVLink Subnet Manager daemon" >&2 + return 1 + fi + fi + + echo "Unloading NVIDIA driver kernel modules..." + if [ -f /sys/module/nvidia_modeset/refcnt ]; then + nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) + rmmod_args+=("nvidia-modeset") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_uvm/refcnt ]; then + nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt) + rmmod_args+=("nvidia-uvm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia/refcnt ]; then + nvidia_refs=$(< /sys/module/nvidia/refcnt) + rmmod_args+=("nvidia") + fi + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + nvidia_peermem_refs=$(< /sys/module/nvidia_peermem/refcnt) + rmmod_args+=("nvidia-peermem") + ((++nvidia_deps)) + fi + if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ] || [ ${nvidia_peermem_refs} -gt 0 ]; then + echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 + return 1 + fi + + if [ ${#rmmod_args[@]} -gt 0 ]; then + rmmod ${rmmod_args[@]} + fi + return 0 +} + +# Link and install the kernel modules from a precompiled package using the nvidia-installer. +_install_driver() { + local install_args=() + + echo "Installing NVIDIA driver kernel modules..." + cd /usr/src/nvidia-${DRIVER_VERSION} + rm -rf /lib/modules/${KERNEL_VERSION}/video + + if [ "${ACCEPT_LICENSE}" = "yes" ]; then + install_args+=("--accept-license") + fi + + # Specify the --skip-module-load flag for versions of the nvidia-installer that + # support it. From the nvidia-installer help output: + # + # --skip-module-load + # Skip the test load of the NVIDIA kernel modules after the modules are built, + # and skip loading them after installation is complete. + # + # Without this flag, a subtle bug can occur if the nvidia-installer fails to unload + # the NVIDIA kernel modules after the test load. The modules will remain loaded and + # any custom NVIDIA module parameters configured as input to the driver container + # will not be applied. 
+ # + if [ "${DRIVER_BRANCH}" -ge "550" ]; then + install_args+=("--skip-module-load") + fi + + IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} + # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path + # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point + # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit + #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"} +} + +# Mount the driver rootfs into the run directory with the exception of sysfs. +_mount_rootfs() { + echo "Mounting NVIDIA driver rootfs..." + mount --make-runbindable /sys + mount --make-private /sys + mkdir -p ${RUN_DIR}/driver + mount --rbind / ${RUN_DIR}/driver + + echo "Check SELinux status" + if [ -e /sys/fs/selinux ]; then + echo "SELinux is enabled" + echo "Change device files security context for selinux compatibility" + chcon -R -t container_file_t ${RUN_DIR}/driver/dev + else + echo "SELinux is disabled, skipping..." + fi +} + +# Unmount the driver rootfs from the run directory. +_unmount_rootfs() { + echo "Unmounting NVIDIA driver rootfs..." + if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then + umount -l -R ${RUN_DIR}/driver + fi +} + +# Write a kernel postinst.d script to automatically precompile packages on kernel update (similar to DKMS). +_write_kernel_update_hook() { + if [ ! -d ${KERNEL_UPDATE_HOOK%/*} ]; then + return + fi + + echo "Writing kernel update hook..." + cat > ${KERNEL_UPDATE_HOOK} <<'EOF' +#!/bin/bash + +set -eu +trap 'echo "ERROR: Failed to update the NVIDIA driver" >&2; exit 0' ERR + +NVIDIA_DRIVER_PID=$(< /run/nvidia/nvidia-driver.pid) + +export "$(grep -z DRIVER_VERSION /proc/${NVIDIA_DRIVER_PID}/environ)" +nsenter -t "${NVIDIA_DRIVER_PID}" -m -- nvidia-driver update --kernel "$1" +EOF + chmod +x ${KERNEL_UPDATE_HOOK} +} + +_shutdown() { + if _unload_driver; then + _unmount_rootfs + rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK} + return 0 + fi + return 1 +} + +# _resolve_kernel_type determines which kernel module type, open or proprietary, to install. +# This function assumes that the nvidia-installer binary is in the PATH, so this function +# should only be invoked after the userspace driver components have been installed. +# +# KERNEL_MODULE_TYPE is the frontend interface that users can use to configure which module +# to install. Valid values for KERNEL_MODULE_TYPE are 'auto' (default), 'open', and 'proprietary'. +# When 'auto' is configured, we use the nvidia-installer to recommend the module type to install. +_resolve_kernel_type() { + if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then + KERNEL_TYPE=kernel + elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then + KERNEL_TYPE=kernel-open + elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then + kernel_module_type=$(nvidia-installer --print-recommended-kernel-module-type) + if [ $? 
-ne 0 ]; then
+            echo "failed to retrieve the recommended kernel module type from nvidia-installer, falling back to using the driver branch"
+            _resolve_kernel_type_from_driver_branch
+            return 0
+        fi
+        [[ "${kernel_module_type}" == "open" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
+    else
+        echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
+        return 1
+    fi
+}
+
+_resolve_kernel_type_from_driver_branch() {
+    [[ "${DRIVER_BRANCH}" -lt 560 ]] && KERNEL_TYPE=kernel || KERNEL_TYPE=kernel-open
+}
+
+_find_vgpu_driver_version() {
+    local count=""
+    local version=""
+    local drivers_path="/drivers"
+
+    if [ "${DISABLE_VGPU_VERSION_CHECK}" = "true" ]; then
+        echo "vgpu version compatibility check is disabled"
+        return 0
+    fi
+    # check if vgpu devices are present
+    count=$(vgpu-util count)
+    if [ $? -ne 0 ]; then
+        echo "cannot find vgpu devices on host, please check /var/log/vgpu-util.log for more details..."
+        return 0
+    fi
+    NUM_VGPU_DEVICES=$(echo "$count" | awk -F= '{print $2}')
+    if [ $NUM_VGPU_DEVICES -eq 0 ]; then
+        # no vgpu devices found, treat as passthrough
+        return 0
+    fi
+    echo "found $NUM_VGPU_DEVICES vgpu devices on host"
+
+    # find compatible guest driver using driver catalog
+    if [ -d "/mnt/shared-nvidia-driver-toolkit/drivers" ]; then
+        drivers_path="/mnt/shared-nvidia-driver-toolkit/drivers"
+    fi
+    version=$(vgpu-util match -i "${drivers_path}" -c "${drivers_path}/vgpuDriverCatalog.yaml")
+    if [ $? -ne 0 ]; then
+        echo "cannot find match for compatible vgpu driver from available list, please check /var/log/vgpu-util.log for more details..."
+        return 1
+    fi
+    DRIVER_VERSION=$(echo "$version" | awk -F= '{print $2}')
+    echo "vgpu driver version selected: ${DRIVER_VERSION}"
+    return 0
+}
+
+_start_vgpu_topology_daemon() {
+    type nvidia-topologyd > /dev/null 2>&1 || return 0
+    echo "Starting nvidia-topologyd..."
+    nvidia-topologyd
+}
+
+_prepare() {
+    if [ "${DRIVER_TYPE}" = "vgpu" ]; then
+        _find_vgpu_driver_version || exit 1
+    fi
+
+    # Install the userspace components and copy the kernel module sources.
+    sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
+        cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
+        sh /tmp/install.sh nvinstall
+
+    # Determine the kernel module type
+    _resolve_kernel_type || exit 1
+
+    # Copy the kernel module sources
+    mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
+        mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \
+        sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest
+
+    echo -e "\n========== NVIDIA Software Installer ==========\n"
+    echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
+}
+
+_prepare_exclusive() {
+    _prepare
+
+    exec 3> ${PID_FILE}
+    if ! 
flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + _unload_driver || exit 1 + _unmount_rootfs +} + +_build() { + # Install dependencies + if _kernel_requires_package; then + _update_package_cache + _install_prerequisites + _create_driver_package + #_remove_prerequisites + _cleanup_package_cache + fi + + # Build the driver + _install_driver +} + +_load() { + _load_driver + _mount_rootfs + _write_kernel_update_hook + + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + +init() { + _prepare_exclusive + + _build + + _load +} + +build() { + _prepare + + _build +} + +load() { + _prepare_exclusive + + _load +} + +update() { + exec 3>&2 + if exec 2> /dev/null 4< ${PID_FILE}; then + if ! flock -n 4 && read pid <&4 && kill -0 "${pid}"; then + exec > >(tee -a "/proc/${pid}/fd/1") + exec 2> >(tee -a "/proc/${pid}/fd/2" >&3) + else + exec 2>&3 + fi + exec 4>&- + fi + exec 3>&- + + # vgpu driver version is chosen dynamically during runtime, so pre-compile modules for + # only non-vgpu driver types + if [ "${DRIVER_TYPE}" != "vgpu" ]; then + if [ ! -e /usr/src/nvidia-${DRIVER_VERSION}/mkprecompiled ]; then + # Install the userspace components + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + sh /tmp/install.sh nvinstall + # Determine the kernel module type + _resolve_kernel_type || exit 1 + # Copy the kernel module sources + mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest + fi + fi + + echo -e "\n========== NVIDIA Software Updater ==========\n" + echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + if _kernel_requires_package; then + # ensure KERNEL_TYPE is set before compiling kernel modules + [[ -n "${KERNEL_TYPE}" ]] || _resolve_kernel_type || exit 1 + _create_driver_package + fi + _remove_prerequisites + _cleanup_package_cache + + echo "Done" + exit 0 +} + +# Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates +reload_nvidia_peermem() { + if [ "$USE_HOST_MOFED" = "true" ]; then + until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /run/nvidia/validations/.driver-ctr-ready ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + else + # use driver readiness flag created by MOFED container + until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /run/nvidia/validations/.driver-ctr-ready ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + fi + # get any parameters provided for nvidia-peermem + _get_module_params && set +o nounset + if chroot /run/nvidia/driver modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}"; then + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + echo "successfully loaded nvidia-peermem module, now waiting for signal" + sleep inf + trap "echo 'Caught signal'; exit 1" HUP 
INT QUIT PIPE TERM
+        fi
+    fi
+    echo "failed to load nvidia-peermem module"
+    exit 1
+}
+
+# Probe used by gpu-operator liveness/startup checks for the nvidia-peermem module to be loaded once MOFED drivers are ready
+probe_nvidia_peermem() {
+    if lsmod | grep mlx5_core > /dev/null 2>&1; then
+        if [ ! -f /sys/module/nvidia_peermem/refcnt ]; then
+            echo "nvidia-peermem module is not loaded"
+            return 1
+        fi
+    else
+        echo "MOFED drivers are not ready, skipping probe to avoid container restarts..."
+    fi
+    return 0
+}
+
+usage() {
+    cat >&2 <<EOF
+Usage: $0 COMMAND [ARG...]
+
+Commands:
+  init   [-a | --accept-license] [-m | --max-threads MAX_THREADS]
+  build  [-a | --accept-license] [-t | --tag TAG] [-m | --max-threads MAX_THREADS]
+  load
+  update [-k | --kernel VERSION] [-s | --sign KEYID] [-t | --tag TAG] [-m | --max-threads MAX_THREADS]
+EOF
+    exit 1
+}
+
+if [ $# -eq 0 ]; then
+    usage
+fi
+command=$1; shift
+case "${command}" in
+    init) options=$(getopt -l accept-license,max-threads: -o am: -- "$@") ;;
+    build) options=$(getopt -l accept-license,tag:,max-threads: -o at:m: -- "$@") ;;
+    load) options="" ;;
+    update) options=$(getopt -l kernel:,sign:,tag:,max-threads: -o k:s:t:m: -- "$@") ;;
+    reload_nvidia_peermem) options="" ;;
+    probe_nvidia_peermem) options="" ;;
+    *) usage ;;
+esac
+if [ $? -ne 0 ]; then
+    usage
+fi
+eval set -- "${options}"
+
+ACCEPT_LICENSE=""
+MAX_THREADS=""
+KERNEL_VERSION=$(uname -r)
+PRIVATE_KEY=""
+PACKAGE_TAG=""
+
+for opt in ${options}; do
+    case "$opt" in
+    -a | --accept-license) ACCEPT_LICENSE="yes"; shift 1 ;;
+    -k | --kernel) KERNEL_VERSION=$2; shift 2 ;;
+    -m | --max-threads) MAX_THREADS=$2; shift 2 ;;
+    -s | --sign) PRIVATE_KEY=$2; shift 2 ;;
+    -t | --tag) PACKAGE_TAG=$2; shift 2 ;;
+    --) shift; break ;;
+    esac
+done
+if [ $# -ne 0 ]; then
+    usage
+fi
+
+_resolve_rhel_version || exit 1
+
+$command
diff --git a/rhel10/ocp_dtk_entrypoint b/rhel10/ocp_dtk_entrypoint
new file mode 100755
--- /dev/null
+++ b/rhel10/ocp_dtk_entrypoint
+#!/bin/bash
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+source $SCRIPT_DIR/common.sh
+
+nv-ctr-run-with-dtk() {
+    set -x
+
+    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
+        echo "WARNING: RHCOS '${RHCOS_VERSION:-}' image tag missing, using entitlement-based fallback"
+        exec bash -x nvidia-driver init
+    fi
+
+    if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
+        cp -r \
+           /tmp/install.sh \
+           /usr/local/bin/ocp_dtk_entrypoint \
+           /usr/local/bin/nvidia-driver \
+           /usr/local/bin/common.sh \
+           /usr/local/bin/extract-vmlinux \
+           /drivers \
+           "$DRIVER_TOOLKIT_SHARED_DIR/"
+
+        if [[ -f "/usr/local/bin/vgpu-util" ]]; then
+            cp /usr/local/bin/vgpu-util "$DRIVER_TOOLKIT_SHARED_DIR/"
+        fi
+
+        env | sed 's/=/="/' | sed 's/$/"/' > "$DRIVER_TOOLKIT_SHARED_DIR/env"
+
+        touch "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared"
+    fi
+
+    set +x
+    while [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" ]]; do
+        if [[ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken" ]]; then
+            echo "WARNING: broken driver toolkit detected, using entitlement-based fallback"
+            exec bash -x nvidia-driver init
+        fi
+        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to start ..."
+        sleep 15
+    done
+
+    echo "$(date) openshift-driver-toolkit-ctr started."
+
+    while [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]]; do
+        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to build the precompiled driver ..."
+        sleep 15
+    done
+    set -x
+
+    MODULES_SHARED=${DRIVER_TOOLKIT_SHARED_DIR}/modules/
+
+    # Copy the modules to their standard location
+    MODULES_LOCAL="/lib/modules/$(uname -r)"
+    mkdir -p "${MODULES_LOCAL}"
+
+    cp -rv "${MODULES_SHARED}"/* "${MODULES_LOCAL}"
+
+    # Tell SELinux to allow loading these files
+    find "${MODULES_LOCAL}" -type f \
+         \( -name "*.ko" -or -name "*.ko.xz" \) \
+         -exec chcon -t modules_object_t "{}" \;
+
+    echo "#"
+    echo "# Executing nvidia-driver load script ..."
+    echo "#"
+
+    exec bash -x nvidia-driver load
+}
+
+dtk-build-driver() {
+    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
+        echo "WARNING: 'istag/driver-toolkit:${RHCOS_VERSION} -n openshift' missing, nothing to do in openshift-driver-toolkit-ctr container"
+        sleep inf
+    fi
+
+    if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
+        echo "WARNING: broken Driver Toolkit image detected:"
+        echo "- Node kernel: $(uname -r)"
+        echo "- Kernel package: $(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core)"
+
+        echo "INFO: informing nvidia-driver-ctr to fallback on entitled-build."
+        touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken"
+        echo "INFO: nothing else to do in openshift-driver-toolkit-ctr container, sleeping forever."
+        sleep inf
+    fi
+
+    # The shared directory is prepared before entering this script. See
+    # 'until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ] ...'
+    # in the Pod command/args
+    touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"
+
+    if [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; then
+        echo "NVIDIA drivers already generated, nothing to do ..."
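+
+        # Block while the driver_built flag exists; if another container removes it,
+        # fall through below and rebuild the drivers.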
+ + while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do + sleep 30 + done + echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..." + else + echo "Start building nvidia.ko driver ..." + fi + + set -x + set -o allexport + source "${DRIVER_TOOLKIT_SHARED_DIR}/env" + set +o allexport; + + DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} + echo "DRIVER_ARCH is $DRIVER_ARCH" + + # If this directory already exists, + # NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run fails to run + # and doesn't create its files. This may happen when the + # container fails and restarts its execution, leading to + # hard-to-understand, seemingly unrelated errors later in the script execution + + rm -rf "${DRIVER_TOOLKIT_SHARED_DIR}/drivers/NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"; + + # elfutils-libelf-devel.x86_64 is already installed in the DTK, which is sufficient + sed 's/elfutils-libelf.x86_64//' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" + + # Install script assumes these directories can be deleted->recreated, + # but recreation doesn't happen in the DTK + sed 's|rm -rf /lib/modules/${KERNEL_VERSION}/video||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" + sed 's|rm -rf /lib/modules/${KERNEL_VERSION}||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" + + mkdir "${DRIVER_TOOLKIT_SHARED_DIR}/bin" -p + + cp -v \ + "$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \ + "$DRIVER_TOOLKIT_SHARED_DIR/common.sh" \ + "$DRIVER_TOOLKIT_SHARED_DIR/extract-vmlinux" \ + "${DRIVER_TOOLKIT_SHARED_DIR}/bin" + + if [[ -f "$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" ]]; then + cp -v "$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" "$DRIVER_TOOLKIT_SHARED_DIR/bin" + fi + + ln -sf "$(which true)" "${DRIVER_TOOLKIT_SHARED_DIR}/bin/dnf" + + export PATH="${DRIVER_TOOLKIT_SHARED_DIR}/bin:$PATH"; + + # The install.sh script is mandatory + cp "${DRIVER_TOOLKIT_SHARED_DIR}/install.sh" /tmp/ + + cd "${DRIVER_TOOLKIT_SHARED_DIR}/drivers"; + echo "#" + echo "# Executing nvidia-driver build script ..." + echo "#" + bash -x "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" build --tag builtin + + echo "#" + echo "# nvidia-driver build script completed." + echo "#" + + drivers=$(ls /lib/modules/"$(uname -r)"/kernel/drivers/video/nvidia*.ko) + if ! ls ${drivers} 2>/dev/null; then + echo "FATAL: no NVIDIA driver generated ..." + exit 1 + fi + + if _gpu_direct_storage_enabled; then + echo "#" + echo "# Executing nvidia-fs driver build." + echo "#" + # The dkms package is not supplied or supported by Red Hat. + # DKMS packages for RHEL are available in the third-party EPEL (Extra Packages for Enterprise Linux) repository. + # see https://access.redhat.com/solutions/1132653 + dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm + dnf config-manager --set-enabled epel + dnf install -y dkms redhat-lsb-core kmod binutils net-tools iputils libudev-devel libnl3-devel udev openssl-devel userspace-rcu libmount + dnf group install -y "Development Tools" + + # Make nvidia driver sources accessible for building nvidia-fs + mkdir -p /lib/modules/${KERNEL_VERSION}/updates + ln -s /run/nvidia/driver/lib/modules/${KERNEL_VERSION}/kernel/drivers/video/ /lib/modules/${KERNEL_VERSION}/updates/dkms + + # Build nvidia-fs + make -C $DRIVER_TOOLKIT_SHARED_DIR/gds-nvidia-fs/src + + echo "#" + echo "# nvidia-fs build script completed." + echo "#" + fi + + if _gdrcopy_enabled; then + echo "#" + echo "# Executing gdrcopy driver build."
+ echo "#" + # Make nvidia driver sources accessible for building nvidia-fs + nvidia_src_dir=$(find /usr/src/nvidia-* -name "nv-p2p.c" -print -quit | xargs dirname || echo "NVIDIA_DRIVER_MISSING" 2>/dev/null) + if [ "nvidia_src_dir" = "NVIDIA_DRIVER_MISSING" ]; then + echo "Failed to find NVIDIA driver source, exiting." + return 1 + fi + export NVIDIA_SRC_DIR=$nvidia_src_dir + + # Build gdrdrv kernel module + make -C $DRIVER_TOOLKIT_SHARED_DIR/gdrcopy driver + + echo "#" + echo "# gdrcopy build script completed." + echo "#" + fi + + MODULES_SHARED="${DRIVER_TOOLKIT_SHARED_DIR}/modules" + mkdir -p "${MODULES_SHARED}" + + # Prepare the list of modules required by NVIDIA + modprobe -a i2c_core ipmi_msghandler ipmi_devintf --show-depends > ${MODULES_SHARED}/insmod_nvidia + modprobe -a nvidia nvidia-uvm nvidia-modeset --show-depends >> ${MODULES_SHARED}/insmod_nvidia + if _gpu_direct_rdma_enabled; then + modprobe -a nvidia-peermem --show-depends >> ${MODULES_SHARED}/insmod_nvidia + fi + + set +x + + # Copy the modules to the shared directory + while read line; do + if [[ "$line" == "builtin "* ]]; then + #eg: line="builtin i2c_core" + continue + fi + # eg: line="insmod /lib/modules/4.18.0-305.10.2.el8_4.x86_64/kernel/drivers/gpu/drm/drm.ko.x" + modsrc=$(echo "${line}" | awk '{ print $2}') + moddir=$(dirname "$(echo "${modsrc}" | sed "s|/lib/modules/$(uname -r)/||")") + moddst="${MODULES_SHARED}/${moddir}" + mkdir -p "${moddst}" + cp -v "${modsrc}" "${moddst}" + done <<< $(cat "${MODULES_SHARED}/insmod_nvidia") + + # Copies modules location and dependency files + cp /lib/modules/$(uname -r)/modules.* "${MODULES_SHARED}" + + echo "NVIDIA drivers generated, inform nvidia-driver-ctr container about it and sleep forever." + touch "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built" + + if _gpu_direct_storage_enabled; then + echo "NVIDIA-FS drivers generated, inform nvidia-fs-driver-ctr container about it and sleep forever." + touch "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia_fs_built" + fi + + if _gdrcopy_enabled; then + echo "gdrcopy driver built, inform nvidia-gdrcopy-ctr container about it and sleep forever." 
+ touch "${DRIVER_TOOLKIT_SHARED_DIR}/gdrcopy_built" + fi + + while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do + sleep 30 + done + + echo "WARNING: driver_built flag disappeared, restart this container" + exit 0 +} + +usage() { + cat >&2 < /etc/dnf/vars/releasever \ + && dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save \ + && dnf -y install kmod binutils + +# Prevent modprobe from giving a WARNING about missing files +RUN rm -rf /lib/modules/${KERNEL_VERSION_NOARCH}.${BUILD_ARCH} \ + && mkdir -p /lib/modules/${KERNEL_VERSION_NOARCH}.${BUILD_ARCH}/proc \ + && touch /lib/modules/${KERNEL_VERSION_NOARCH}.${BUILD_ARCH}/modules.order \ + && touch /lib/modules/${KERNEL_VERSION_NOARCH}.${BUILD_ARCH}/modules.builtin \ + && depmod ${KERNEL_VERSION_NOARCH}.${BUILD_ARCH} + +# Copy the DRM module dependency from the builder +COPY --from=builder /lib/modules/${KERNEL_VERSION_NOARCH}.${BUILD_ARCH}/kernel/drivers/gpu/drm/drm.ko.xz /lib/modules/${KERNEL_VERSION_NOARCH}.${BUILD_ARCH}/kernel/drivers/gpu/drm/drm.ko.xz + +# Copy the built NVIDIA driver RPM from the builder +COPY --from=builder /home/builder/yum-packaging-precompiled-kmod/RPMS/${TARGET_ARCH}/*.rpm /rpms/ + +# Install the Driver modules +RUN dnf install -y /rpms/kmod-nvidia-*.rpm \ + && rm -rf /rpms + +# Copy the rhsm-register script to enable subscription-manager during build time +COPY --chmod=744 ./rhsm-register /usr/local/bin/rhsm-register + +RUN --mount=type=secret,id=RHSM_ORG,target=/run/secrets/RHSM_ORG \ + --mount=type=secret,id=RHSM_ACTIVATIONKEY,target=/run/secrets/RHSM_ACTIVATIONKEY \ + export DRIVER_STREAM=$(echo ${DRIVER_VERSION} | cut -d '.' -f 1) \ + && if [ "${DRIVER_STREAM_TYPE}" == "development" ] ; then \ + curl -sLOf ${BASE_URL}/${DRIVER_VERSION}/NVIDIA-Linux-${TARGET_ARCH}-${DRIVER_VERSION}.run \ + sh ./NVIDIA-Linux-${TARGET_ARCH}-${DRIVER_VERSION}.run \ + --silent \ + --accept-license \ + --no-kernel-modules \ + --no-nvidia-modprobe \ + --no-rebuild-initramfs \ + && rm NVIDIA-Linux-${TARGET_ARCH}-${DRIVER_VERSION}.run ; \ + else \ + CUDA_VERSION_ARRAY=(${CUDA_VERSION//./ }) && CUDA_DASHED_VERSION=${CUDA_VERSION_ARRAY[0]}-${CUDA_VERSION_ARRAY[1]} \ + && rm /etc/rhsm-host \ + && /usr/local/bin/rhsm-register \ + && dnf -y module enable nvidia-driver:${DRIVER_STREAM}-open/default \ + && dnf install -y \ + nvidia-driver-cuda-${DRIVER_VERSION} \ + nvidia-driver-libs-${DRIVER_VERSION} \ + libnvidia-ml-${DRIVER_VERSION} \ + cuda-compat-${CUDA_DASHED_VERSION} \ + cuda-cudart-${CUDA_DASHED_VERSION} \ + nvidia-persistenced-${DRIVER_VERSION} \ + && if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGETARCH" != "arm64" ]; then \ + versionArray=(${DRIVER_VERSION//./ }); \ + DRIVER_BRANCH=${versionArray[0]}; \ + dnf install -y nvidia-fabric-manager-${DRIVER_VERSION} libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION} ; \ + fi \ + && subscription-manager unregister ; \ + fi + +RUN dnf clean all \ + && rm /usr/local/bin/rhsm-register + +LABEL io.k8s.display-name="NVIDIA Driver Container" +LABEL name="NVIDIA Driver Container" +LABEL vendor="NVIDIA" +LABEL version="${DRIVER_VERSION}" +LABEL release="${KERNEL_VERSION}-${OS_TAG}" +LABEL summary="Provision the NVIDIA driver through containers" +LABEL description="See summary" + +# Add NGC DL license from the CUDA image +RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE + +# Remove cuda repository to avoid GPG errors +RUN rm -f /etc/yum.repos.d/cuda.repo + +ENTRYPOINT ["nvidia-driver", "init"] diff --git a/rhel10/precompiled/LICENSE 
b/rhel10/precompiled/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/rhel10/precompiled/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/rhel10/precompiled/Makefile b/rhel10/precompiled/Makefile new file mode 100644 index 000000000..f826f22fa --- /dev/null +++ b/rhel10/precompiled/Makefile @@ -0,0 +1,81 @@ +RHEL_VERSION ?= 10.0 +RHEL_VERSION_MAJOR = $(shell echo "${RHEL_VERSION}" | awk -F. '{print $$1}') + +KERNEL_VERSION ?= '' +BUILD_ARCH ?= x86_64 +TARGET_ARCH ?= $(shell echo "${BUILD_ARCH}" | sed "s/+64k//") + +KERNEL_VERSION_NOARCH=$(shell echo "${KERNEL_VERSION}" | sed "s/\.${TARGET_ARCH}//") +KERNEL_VERSION_TAG = $(shell echo "${KERNEL_VERSION_NOARCH}.${BUILD_ARCH}" | sed "s/+/_/") +BASE_URL ?= https://us.download.nvidia.com/tesla + +CUDA_VERSION ?= 12.8.1 +CUDA_DIST = ubi${RHEL_VERSION_MAJOR} + +DRIVER_VERSION ?= '' +DRIVER_TYPE ?= passthrough +DRIVER_OPEN ?= false +DRIVER_STREAM_TYPE ?= '' + +CONTAINER_TOOL ?= docker +DOCKERFILE ?= Dockerfile + +IMAGE_REGISTRY ?= nvcr.io/ea-cnt/nv_only +ifeq ($(DRIVER_TYPE),vgpu) + IMAGE_NAME ?= vgpu-guest-driver +else + IMAGE_NAME ?= driver +endif + +BUILDER_USER ?= $(shell git config --get user.name) +BUILDER_EMAIL ?= $(shell git config --get user.email) + +# Red Hat subscription +RHSM_ORG_FILE ?= ${HOME}/.rhsm_org +RHSM_ACTIVATIONKEY_FILE ?= ${HOME}/.rhsm_activationkey + +.PHONY: image image-push rhsm-register + +# Build the image +image: rhsm-register + @echo "!=== Building image ${IMAGE_REGISTRY}/${IMAGE_NAME}:${DRIVER_VERSION}-${KERNEL_VERSION_TAG}-${OS_TAG} ===!" + @if [ "${CONTAINER_TOOL}" == "docker" ]; then \ + echo "!=== Work around accessing DTK image using a pull secret with Docker ===!" \ + && mkdir -p .tmp && cp ${PULL_SECRET_FILE} .tmp/config.json \ + && ${CONTAINER_TOOL} --config .tmp pull ${DRIVER_TOOLKIT_IMAGE} \ + && rm ./.tmp/config.json \ + && BUILD_CMD_ADD_ARGS=; \ + else \ + BUILD_CMD_ADD_ARGS="--authfile ${PULL_SECRET_FILE}"; \ + fi \ + && ${CONTAINER_TOOL} build \ + $${BUILD_CMD_ADD_ARGS} \ + --secret id=RHSM_ORG,src=${RHSM_ORG_FILE} \ + --secret id=RHSM_ACTIVATIONKEY,src=${RHSM_ACTIVATIONKEY_FILE} \ + --build-arg RHEL_VERSION=${RHEL_VERSION} \ + --build-arg RHEL_VERSION_MAJOR=${RHEL_VERSION_MAJOR} \ + --build-arg CUDA_VERSION=${CUDA_VERSION} \ + --build-arg CUDA_DIST=${CUDA_DIST} \ + --build-arg BUILD_ARCH=${BUILD_ARCH} \ + --build-arg TARGET_ARCH=${TARGET_ARCH} \ + --build-arg KERNEL_VERSION=${KERNEL_VERSION} \ + --build-arg KERNEL_VERSION_NOARCH=${KERNEL_VERSION_NOARCH} \ + --build-arg DRIVER_VERSION=${DRIVER_VERSION} \ + --build-arg DRIVER_EPOCH=${DRIVER_EPOCH} \ + --build-arg BUILDER_USER="${BUILDER_USER}" \ + --build-arg BUILDER_EMAIL=${BUILDER_EMAIL} \ + --build-arg DRIVER_TOOLKIT_IMAGE=${DRIVER_TOOLKIT_IMAGE} \ + --build-arg DRIVER_OPEN=${DRIVER_OPEN} \ + --build-arg DRIVER_TYPE=${DRIVER_TYPE} \ + --build-arg DRIVER_STREAM_TYPE=${DRIVER_STREAM_TYPE} \ + --build-arg BASE_URL=${BASE_URL} \ + --build-arg OS_TAG=${OS_TAG} \ + --tag ${IMAGE_REGISTRY}/${IMAGE_NAME}:${DRIVER_VERSION}-${KERNEL_VERSION_TAG}-${OS_TAG} \ + --progress=plain \ + --file ${DOCKERFILE} . + +image-push: + @echo "!=== Pushing image ===!" 
+ ${CONTAINER_TOOL} push \ + ${IMAGE_REGISTRY}/${IMAGE_NAME}:${DRIVER_VERSION}-${KERNEL_VERSION_TAG}-${OS_TAG} + diff --git a/rhel10/precompiled/README.md b/rhel10/precompiled/README.md new file mode 100644 index 000000000..a7d84a5b3 --- /dev/null +++ b/rhel10/precompiled/README.md @@ -0,0 +1,189 @@ +# Precompiled NVIDIA GPU driver container image for RHEL 10.x (RHCOS 4.13+) + +The procedure is based on [building custom kmod packages](https://github.com/NVIDIA/yum-packaging-precompiled-kmod) to allow support for a wide range of kernel versions. + +**Prerequisites**: + +* A Red Hat account with access to Red Hat Hybrid Cloud Console and Red Hat Subscription Management (RHSM). +* A machine for each architecture that the image is built for. Cross-compilation is not supported. + +## Image build + +1. Create a [Red Hat Customer Portal Activation Key](https://access.redhat.com/articles/1378093) and note your Red Hat Subscription Management (RHSM) organization ID. These will be used to install packages during a build. Save the values to files, e.g., `$HOME/rhsm_org` and `$HOME/rhsm_activationkey`, and export the paths to these files. + + ``` + export RHSM_ORG_FILE=$HOME/rhsm_org + export RHSM_ACTIVATIONKEY_FILE=$HOME/rhsm_activationkey + ``` + +2. Download a [Red Hat OpenShift pull secret](https://access.redhat.com/documentation/en-us/openshift_cluster_manager/2023/html/managing_clusters/assembly-managing-clusters#downloading_and_updating_pull_secrets). + + Once you have downloaded the pull secret, put it in a + `pull-secret.txt` file in the home folder of the user + building the precompiled driver image, and export the + path to this file. + + ``` + export PULL_SECRET_FILE=$HOME/pull-secret.txt + ``` + +3. Find out the Driver Toolkit (DTK) image for your target Red Hat OpenShift version, e.g.: + + *The Driver Toolkit (DTK from now on) is a container image in the + OpenShift payload which is meant to be used as a base image on + which to build driver containers. The Driver Toolkit image contains + the kernel packages commonly required as dependencies to build or + install kernel modules as well as a few tools needed in driver + containers. The version of these packages will match the kernel + version running on the RHCOS nodes in the corresponding OpenShift + release.* -- [Driver Toolkit](https://github.com/openshift/driver-toolkit/) + + With that in mind, we can start defining some environment variables + and get the Driver Toolkit image for the version of OpenShift we + need to compile the drivers for. + + First, we define the version of OpenShift and the architecture. + + ***Note*** - Red Hat Enterprise Linux 10 provides a kernel compiled + with 64k page size for the `aarch64` architecture. For these builds, + the version of the kernel is suffixed with `+64k`. Hence, we need + to differentiate the target architecture, which is `aarch64`, from + the kernel build suffix, which is either empty or `+64k`. + + ``` + export OPENSHIFT_VERSION='4.15.0' + export BUILD_ARCH='aarch64+64k' + export TARGET_ARCH=$(echo "${BUILD_ARCH}" | sed 's/+64k//') + ``` + + We can now get the Driver Toolkit image for OpenShift. + + ``` + export DRIVER_TOOLKIT_IMAGE=$( \ + oc adm release info --image-for=driver-toolkit \ + quay.io/openshift-release-dev/ocp-release:${OPENSHIFT_VERSION}-${TARGET_ARCH} \ + ) + ``` + + Regarding the naming convention, the generated image tag needs to + contain `rhcos` and the minor version of OpenShift. We export that + as the `OS_TAG` environment variable.
+ + ``` + export OS_TAG=rhcos$(echo ${OPENSHIFT_VERSION} | awk -F. '{print $1"."$2}') + ``` + +4. Find out the RHEL and kernel version of the target OpenShift cluster. + + Driver Toolkit contains the `/etc/driver-toolkit-release.json` file + that exposes the RHEL and kernel versions that the Driver + Toolkit was built for. We can extract them with `podman run` and + `jq`. + + First, the RHEL version. + + ``` + export RHEL_VERSION=$(podman run --rm -it \ + --authfile ${PULL_SECRET_FILE} \ + ${DRIVER_TOOLKIT_IMAGE} \ + cat /etc/driver-toolkit-release.json \ + | jq -r '.RHEL_VERSION') + ``` + + Then, the kernel version. + + ``` + export KERNEL_VERSION=$(podman run --rm -it \ + --authfile ${PULL_SECRET_FILE} \ + ${DRIVER_TOOLKIT_IMAGE} \ + cat /etc/driver-toolkit-release.json \ + | jq -r '.KERNEL_VERSION') + ``` + +5. Set NVIDIA environment variables. + + ``` + export CUDA_VERSION=12.8.1 + export DRIVER_EPOCH=1 + export DRIVER_VERSION=570.133.20 + ``` + +6. [Optional] Use custom signing keys + + By default, the build process generates a self-signed key and certificate, + because the spec file expects them during the build. It uses the + `x509-configuration.ini` file to set the OpenSSL configuration. However, + for Secure Boot, it is recommended to use signing keys that are trusted by + the machines, i.e. that are part of the authorized keys database. + + To pass a custom signing key and certificate during the build, you can put + them in the current folder as `private_key.priv` for the private key and + `public_key.der` for the public certificate in DER format. The build process + will use them if they are present, and fall back to a self-signed certificate + otherwise. + +7. [Optional] Build the vGPU guest driver + + To build the vGPU guest driver, set the `DRIVER_TYPE` environment + variable to `vgpu`. The default is `passthrough`. + +8. [Optional] Customize the build settings + + The default container management tool is Docker (`docker`). You can + override it to use Podman by setting the `CONTAINER_TOOL` environment + variable to `podman`. + + The default registry is `nvcr.io/ea-cnt/nv_only`, which is limited to NVIDIA. + You can override it to your own registry via the `IMAGE_REGISTRY` + environment variable. + + The default image name is `driver` for `passthrough` and + `vgpu-guest-driver` for vGPU. You can override it by setting the + `IMAGE_NAME` environment variable. + + You can also override `BUILDER_USER` and/or `BUILDER_EMAIL`. Otherwise, + your Git username and email will be used. + + See the [Makefile](Makefile) for all available variables. + +9. Build and push the image + + ``` + make image image-push + ``` + +## NVIDIA GPU Operator + +In order to be used with the NVIDIA GPU Operator on Red Hat OpenShift, +the image tag must follow the format `${DRIVER_VERSION}-${KERNEL_VERSION}-${OS_TAG}`, +and the full name will look like +`quay.io/acme/nvidia-gpu-driver:550.54.14-5.14.0-284.54.1.el9_2.aarch64_64k-rhcos4.15`.
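+ +Assuming the variables exported in the steps above, the tag can be composed the same way the [Makefile](Makefile) computes `KERNEL_VERSION_TAG`; a minimal sketch (the `IMAGE_TAG` name here is only illustrative): + + ``` + # strip the architecture suffix, then append the build arch with '+' mapped to '_' + export KERNEL_VERSION_NOARCH=$(echo "${KERNEL_VERSION}" | sed "s/\.${TARGET_ARCH}//") + export IMAGE_TAG="${DRIVER_VERSION}-$(echo "${KERNEL_VERSION_NOARCH}.${BUILD_ARCH}" | sed "s/+/_/")-${OS_TAG}" + echo "${IMAGE_TAG}" + ```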
+ + +Define the `NVIDIADriver` custom resource to make use of the precompiled driver image, e.g.: + +``` + spec: + usePrecompiled: true + repository: quay.io/acme + image: nvidia-gpu-driver + version: 550.127.05 +``` + +Define the `ClusterPolicy` resource to make use of the NVIDIADriver custom resource, e.g.: + +``` + driver: + enabled: true + useNvidiaDriverCRD: true + validator: + driver: + env: + - name: DISABLE_DEV_CHAR_SYMLINK_CREATION + value: "true" +``` + +Examples of full NVIDIADriver and ClusterPolicy custom resources are available in the +[nvidiadriver.json](nvidiadriver.json) and [clusterpolicy.json](clusterpolicy.json) files. + +Find more information in the [Precompiled Driver Containers](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/precompiled-drivers.html) documentation. diff --git a/rhel10/precompiled/clusterpolicy.json b/rhel10/precompiled/clusterpolicy.json new file mode 100644 index 000000000..8a0d5c76b --- /dev/null +++ b/rhel10/precompiled/clusterpolicy.json @@ -0,0 +1,132 @@ +{ + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": { + "name": "gpu-cluster-policy" + }, + "spec": { + "operator": { + "defaultRuntime": "crio", + "use_ocp_driver_toolkit": true, + "initContainer": {} + }, + "sandboxWorkloads": { + "enabled": false, + "defaultWorkload": "container" + }, + "driver": { + "enabled": true, + "useNvidiaDriverCRD": true, + "useOpenKernelModules": false, + "upgradePolicy": { + "autoUpgrade": true, + "drain": { + "deleteEmptyDir": false, + "enable": false, + "force": false, + "timeoutSeconds": 300 + }, + "maxParallelUpgrades": 1, + "maxUnavailable": "25%", + "podDeletion": { + "deleteEmptyDir": false, + "force": false, + "timeoutSeconds": 300 + }, + "waitForCompletion": { + "timeoutSeconds": 0 + } + }, + "repoConfig": { + "configMapName": "" + }, + "certConfig": { + "name": "" + }, + "licensingConfig": { + "nlsEnabled": true, + "configMapName": "" + }, + "virtualTopology": { + "config": "" + }, + "kernelModuleConfig": { + "name": "" + } + }, + "dcgmExporter": { + "enabled": true, + "config": { + "name": "" + }, + "serviceMonitor": { + "enabled": true + } + }, + "dcgm": { + "enabled": true + }, + "daemonsets": { + "updateStrategy": "RollingUpdate", + "rollingUpdate": { + "maxUnavailable": "1" + } + }, + "devicePlugin": { + "enabled": true, + "config": { + "name": "", + "default": "" + } + }, + "gfd": { + "enabled": true + }, + "migManager": { + "enabled": true + }, + "nodeStatusExporter": { + "enabled": true + }, + "mig": { + "strategy": "single" + }, + "toolkit": { + "enabled": true + }, + "validator": { + "driver": { + "env": [ + { + "name": "DISABLE_DEV_CHAR_SYMLINK_CREATION", + "value": "true" + } + ] + }, + "plugin": { + "env": [ + { + "name": "WITH_WORKLOAD", + "value": "false" + } + ] + } + }, + "vgpuManager": { + "enabled": false + }, + "vgpuDeviceManager": { + "enabled": true + }, + "sandboxDevicePlugin": { + "enabled": true + }, + "vfioManager": { + "enabled": true + }, + "gds": { + "enabled": false + } + } +} + diff --git a/rhel10/precompiled/common.sh b/rhel10/precompiled/common.sh new file mode 100755 index 000000000..83cf7d075 --- /dev/null +++ b/rhel10/precompiled/common.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
+ +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" + +_mellanox_devices_present() { + devices_found=0 + for dev in /sys/bus/pci/devices/*; do + read vendor < $dev/vendor + if [ "$vendor" = "0x15b3" ]; then + echo "Mellanox device found at $(basename $dev)" + return 0 + fi + done + echo "No Mellanox devices were found..." + return 1 +} + +_gpu_direct_rdma_enabled() { + if [ "${GPU_DIRECT_RDMA_ENABLED}" = "true" ]; then + # check if mellanox cards are present + if _mellanox_devices_present; then + return 0 + fi + fi + return 1 +} diff --git a/rhel10/precompiled/nvidia-driver b/rhel10/precompiled/nvidia-driver new file mode 100755 index 000000000..750eb180b --- /dev/null +++ b/rhel10/precompiled/nvidia-driver @@ -0,0 +1,512 @@ +#! /bin/bash -x +# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + +set -eu + +RUN_DIR=/run/nvidia +PID_FILE=${RUN_DIR}/${0##*/}.pid +DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} +KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver +NUM_VGPU_DEVICES=0 +NVIDIA_MODULE_PARAMS=() +NVIDIA_UVM_MODULE_PARAMS=() +NVIDIA_MODESET_MODULE_PARAMS=() +NVIDIA_PEERMEM_MODULE_PARAMS=() +TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" +DNF_RELEASEVER=${DNF_RELEASEVER:-""} +RHEL_VERSION=${RHEL_VERSION:-""} +RHEL_MAJOR_VERSION=10 + + +DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} +echo "DRIVER_ARCH is $DRIVER_ARCH" + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source $SCRIPT_DIR/common.sh + +_update_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Updating the package cache..." + if ! yum -q makecache; then + echo "FATAL: failed to reach RHEL package repositories. "\ + "Ensure that the cluster can access the proper networks." + exit 1 + fi + fi +} + +_cleanup_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Cleaning up the package cache..." + rm -rf /var/cache/yum/* + fi +} + +_get_rhel_version_from_kernel() { + local rhel_version_underscore rhel_version_arr + rhel_version_underscore=$(echo "${KERNEL_VERSION}" | sed 's/.*el\([0-9]\+_[0-9]\+\).*/\1/g') + # For e.g. :- from the kernel version 4.18.0-513.9.1.el8_9, we expect to extract the string "8_9" + if [[ ! ${rhel_version_underscore} =~ ^[0-9]+_[0-9]+$ ]]; then + echo "Unable to resolve RHEL version from kernel version" >&2 + return 1 + fi + IFS='_' read -r -a rhel_version_arr <<< "$rhel_version_underscore" + if [[ ${#rhel_version_arr[@]} -ne 2 ]]; then + echo "Unable to resolve RHEL version from kernel version" >&2 + return 1 + fi + RHEL_VERSION="${rhel_version_arr[0]}.${rhel_version_arr[1]}" + echo "RHEL VERSION successfully resolved from kernel: ${RHEL_VERSION}" + return 0 +} + +_resolve_rhel_version() { + _get_rhel_version_from_kernel || RHEL_VERSION="${RHEL_MAJOR_VERSION}" + # set dnf release version as rhel version by default + if [[ -z "${DNF_RELEASEVER}" ]]; then + DNF_RELEASEVER="${RHEL_VERSION}" + fi + return 0 +} + +# Resolve the kernel version to the form major.minor.patch-revision. +_resolve_kernel_version() { + echo "Resolving Linux kernel version..." 
+ local version=$(yum -q list available --showduplicates kernel-headers | + awk -v arch=$(uname -m) 'NR>1 {print $2"."arch}' | tac | grep -E -m1 "^${KERNEL_VERSION/latest/.*}") + + if [ -z "${version}" ]; then + echo "Could not resolve Linux kernel version" >&2 + return 1 + fi + KERNEL_VERSION="${version}" + echo "Proceeding with Linux kernel version ${KERNEL_VERSION}" + return 0 +} + +# Clean up the prerequisites installed above. +_remove_prerequisites() { + true + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + dnf -q -y remove kernel-headers-${KERNEL_VERSION} kernel-devel-${KERNEL_VERSION} > /dev/null + # TODO remove module files not matching an existing driver package. + fi +} + +# Check if the kernel version requires new precompiled driver packages. +_kernel_requires_package() { + local proc_mount_arg="" + + echo "Checking NVIDIA driver packages..." + + [[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/kernel ]] && return 0 + cd /usr/src/nvidia-${DRIVER_VERSION}/kernel + + proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc" + for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do + is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg}) + if [ "${is_match}" == "kernel interface matches." ]; then + echo "Found NVIDIA driver package ${pkg_name##*/}" + return 1 + fi + done + return 0 +} + +_assert_nvswitch_system() { + [ -d /proc/driver/nvidia-nvswitch/devices ] || return 1 + if [ -z "$(ls -A /proc/driver/nvidia-nvswitch/devices)" ]; then + return 1 + fi + return 0 +} + +# For each kernel module configuration file mounted into the container, +# parse the file contents and extract the custom module parameters that +# are to be passed as input to 'modprobe'. +# +# Assumptions: +# - Configuration files are named .conf (e.g. nvidia.conf, nvidia-uvm.conf). +# - Configuration files are mounted inside the container at /drivers. +# - Each line in the file contains at least one parameter, where parameters on the same line +# are space delimited. It is up to the user to properly format the file to ensure +# the correct set of parameters are passed to 'modprobe'. +_get_module_params() { + local base_path="/drivers" + # nvidia + if [ -f "${base_path}/nvidia.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia.conf" + echo "Module parameters provided for nvidia: ${NVIDIA_MODULE_PARAMS[@]}" + fi + # nvidia-uvm + if [ -f "${base_path}/nvidia-uvm.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_UVM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-uvm.conf" + echo "Module parameters provided for nvidia-uvm: ${NVIDIA_UVM_MODULE_PARAMS[@]}" + fi + # nvidia-modeset + if [ -f "${base_path}/nvidia-modeset.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODESET_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-modeset.conf" + echo "Module parameters provided for nvidia-modeset: ${NVIDIA_MODESET_MODULE_PARAMS[@]}" + fi + # nvidia-peermem + if [ -f "${base_path}/nvidia-peermem.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_PEERMEM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-peermem.conf" + echo "Module parameters provided for nvidia-peermem: ${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + fi +} + +# Load the kernel modules and start persistenced. +_load_driver() { + echo "Parsing kernel module parameters..."
+ _get_module_params + + local nv_fw_search_path="$RUN_DIR/driver/lib/firmware" + local set_fw_path="true" + local fw_path_config_file="/sys/module/firmware_class/parameters/path" + for param in "${NVIDIA_MODULE_PARAMS[@]}"; do + if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then + set_fw_path="false" + fi + done + + echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path" + if [[ ! -z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then + echo "WARNING: A search path is already configured in $fw_path_config_file" + echo " Retaining the current configuration" + else + echo -n "$nv_fw_search_path" > $fw_path_config_file || echo "WARNING: Failed to configure the firmware search path" + if [ -d "/opt/lib/firmware/nvidia/${DRIVER_VERSION}" ]; then + rm -rf $nv_fw_search_path/nvidia/${DRIVER_VERSION} + mkdir -p $nv_fw_search_path/nvidia/${DRIVER_VERSION} + cp /opt/lib/firmware/nvidia/${DRIVER_VERSION}/gsp_*.bin $nv_fw_search_path/nvidia/${DRIVER_VERSION} + fi + fi + + echo "Loading NVIDIA driver kernel modules..." + set -o xtrace +o nounset + modprobe nvidia "${NVIDIA_MODULE_PARAMS[@]}" + modprobe nvidia-uvm "${NVIDIA_UVM_MODULE_PARAMS[@]}" + modprobe nvidia-modeset "${NVIDIA_MODESET_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + + if _gpu_direct_rdma_enabled; then + echo "Loading NVIDIA Peer Memory kernel module..." + set -o xtrace +o nounset + modprobe -a nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + fi + + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib64/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon + fi + + if _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi +} + +# Stop persistenced and unload the kernel modules if they are currently loaded. +_unload_driver() { + local rmmod_args=() + local nvidia_deps=0 + local nvidia_refs=0 + local nvidia_uvm_refs=0 + local nvidia_modeset_refs=0 + local nvidia_peermem_refs=0 + + echo "Stopping NVIDIA persistence daemon..." + if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then + local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA persistence daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-gridd/nvidia-gridd.pid ]; then + echo "Stopping NVIDIA grid daemon..." + local pid=$(< /var/run/nvidia-gridd/nvidia-gridd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 10); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 10 ]; then + echo "Could not stop NVIDIA Grid daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then + echo "Stopping NVIDIA fabric manager daemon..." 
+ local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA fabric manager daemon" >&2 + return 1 + fi + fi + + echo "Unloading NVIDIA driver kernel modules..." + if [ -f /sys/module/nvidia_modeset/refcnt ]; then + nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) + rmmod_args+=("nvidia-modeset") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_uvm/refcnt ]; then + nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt) + rmmod_args+=("nvidia-uvm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia/refcnt ]; then + nvidia_refs=$(< /sys/module/nvidia/refcnt) + rmmod_args+=("nvidia") + fi + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + nvidia_peermem_refs=$(< /sys/module/nvidia_peermem/refcnt) + rmmod_args+=("nvidia-peermem") + ((++nvidia_deps)) + fi + if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ] || [ ${nvidia_peermem_refs} -gt 0 ]; then + echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 + return 1 + fi + + if [ ${#rmmod_args[@]} -gt 0 ]; then + rmmod ${rmmod_args[@]} + fi + return 0 +} + +# Mount the driver rootfs into the run directory with the exception of sysfs. +_mount_rootfs() { + echo "Mounting NVIDIA driver rootfs..." + mount --make-runbindable /sys + mount --make-private /sys + mkdir -p ${RUN_DIR}/driver + mount --rbind / ${RUN_DIR}/driver + + echo "Check SELinux status" + if [ -e /sys/fs/selinux ]; then + echo "SELinux is enabled" + echo "Change device files security context for selinux compatibility" + chcon -R -t container_file_t ${RUN_DIR}/driver/dev + else + echo "SELinux is disabled, skipping..." + fi +} + +# Unmount the driver rootfs from the run directory. +_unmount_rootfs() { + echo "Unmounting NVIDIA driver rootfs..." + if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then + umount -l -R ${RUN_DIR}/driver + fi +} + + + +_shutdown() { + if _unload_driver; then + _unmount_rootfs + rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK} + return 0 + fi + return 1 +} + +_load() { + _load_driver + _mount_rootfs + + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + +init() { + _load +} + +load() { + _load +} + +update() { + exec 3>&2 + if exec 2> /dev/null 4< ${PID_FILE}; then + if ! flock -n 4 && read pid <&4 && kill -0 "${pid}"; then + exec > >(tee -a "/proc/${pid}/fd/1") + exec 2> >(tee -a "/proc/${pid}/fd/2" >&3) + else + exec 2>&3 + fi + exec 4>&- + fi + exec 3>&- + + # vgpu driver version is chosen dynamically during runtime, so pre-compile modules for + # only non-vgpu driver types + if [ "${DRIVER_TYPE}" != "vgpu" ]; then + # Install the userspace components and copy the kernel module sources. + if [ !
-e /usr/src/nvidia-${DRIVER_VERSION}/mkprecompiled ]; then + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + sh /tmp/install.sh nvinstall && \ + mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest + fi + fi + + echo -e "\n========== NVIDIA Software Updater ==========\n" + echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + if _kernel_requires_package; then + _create_driver_package + fi + _remove_prerequisites + _cleanup_package_cache + + echo "Done" + exit 0 +} + +# Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates +reload_nvidia_peermem() { + if [ "$USE_HOST_MOFED" = "true" ]; then + until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /run/nvidia/validations/.driver-ctr-ready ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + else + # use driver readiness flag created by MOFED container + until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /run/nvidia/validations/.driver-ctr-ready ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + fi + # get any parameters provided for nvidia-peermem + _get_module_params && set +o nounset + if chroot /run/nvidia/driver modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}"; then + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + echo "successfully loaded nvidia-peermem module, now waiting for signal" + sleep inf + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + fi + fi + echo "failed to load nvidia-peermem module" + exit 1 +} + +# probe by gpu-operator for liveness/startup checks for nvidia-peermem module to be loaded when MOFED drivers are ready +probe_nvidia_peermem() { + if lsmod | grep mlx5_core > /dev/null 2>&1; then + if [ ! -f /sys/module/nvidia_peermem/refcnt ]; then + echo "nvidia-peermem module is not loaded" + return 1 + fi + else + echo "MOFED drivers are not ready, skipping probe to avoid container restarts..." + fi + return 0 +} + +usage() { + cat >&2 <