Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/cases/nvidia/manifests/job-hpc-benchmarks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ spec:
resources:
limits:
nvidia.com/gpu: {{.GpuPerNode}}
vpc.amazonaws.com/efa: {{.EfaPerNode}}
env:
- name: UCX_TLS
value: "^sysv"
Expand Down
5 changes: 4 additions & 1 deletion test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type unitTestManifestTplVars struct {

type hpcTestManifestTplVars struct {
GpuPerNode int
EfaPerNode int
}

func TestSingleNodeUnitTest(t *testing.T) {
Expand Down Expand Up @@ -98,6 +99,7 @@ func TestSingleNodeUnitTest(t *testing.T) {
var err error
renderedJobHpcBenchmarksSingleNodeManifest, err = fwext.RenderManifests(jobHpcBenchmarksSingleNodeManifest, hpcTestManifestTplVars{
GpuPerNode: gpuPerNode,
EfaPerNode: efaPerNode,
})
if err != nil {
t.Fatal(err)
Expand All @@ -113,7 +115,8 @@ func TestSingleNodeUnitTest(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "hpc-benckmarks-job", Namespace: "default"},
}
err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
wait.WithContext(ctx))
wait.WithContext(ctx),
wait.WithTimeout(20*time.Minute))
if err != nil {
t.Fatal(err)
}
Expand Down
4 changes: 2 additions & 2 deletions test/images/nvidia-inference/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
###############################################################################
# Base image, arguments, and environment
###############################################################################
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=8
ARG CUDA_MAJOR_VERSION=13
ARG CUDA_MINOR_VERSION=0

FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04

Expand Down
12 changes: 6 additions & 6 deletions test/images/nvidia-training/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=8
ARG CUDA_MAJOR_VERSION=13
ARG CUDA_MINOR_VERSION=0

# Use the NVIDIA CUDA runtime as a parent image
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
Expand Down Expand Up @@ -65,15 +65,15 @@ RUN ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& pip --no-cache-dir install --upgrade pip setuptools

# Install Pytorch from Source
ARG PYTORCH_BRANCH=v2.6.0
ARG PYTORCH_BRANCH=v2.9.0
ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"

ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
ENV PATH=$PATH:$CUDA_HOME/bin
ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.7;8.9;9.0;10.0;12.0"

RUN pip install typing-extensions sympy pyyaml
RUN pip install typing-extensions sympy pyyaml cmake
RUN git clone https://github.com/pytorch/pytorch.git /tmp/pytorch \
--recursive \
--branch $PYTORCH_BRANCH \
Expand Down Expand Up @@ -111,15 +111,15 @@ RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLE
&& cd && rm -rf /tmp/aws-efa-installer

# Install NCCL
ARG LIBNCCL_VERSION=2.27.7-1
ARG LIBNCCL_VERSION=2.28.3-1
RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \
&& cd /tmp/nccl \
&& make -j $(nproc) \
&& make install \
&& cd && rm -rf /tmp/nccl

# Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=1.16.3
ARG AWS_OFI_NCCL_VERSION=1.17.1
RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp \
&& cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
&& ./configure \
Expand Down
84 changes: 84 additions & 0 deletions test/images/nvidia-training/Dockerfilesimple
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# syntax=docker/dockerfile:1
FROM nvcr.io/nvidia/pytorch:25.10-py3

# Build-time only: suppress interactive apt prompts. ARG (not ENV) so the
# setting does not leak into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive

# Fail RUN pipelines (e.g. `curl … | tar`) when the upstream command fails;
# the default /bin/sh -c would mask a broken download (hadolint DL4006).
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Install OpenSSH for the MPI Operator (mpirun reaches worker pods over ssh)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
      openssh-client \
      openssh-server && \
    rm -rf /var/lib/apt/lists/*

# Configure SSH for non-interactive MPI startup: no host-key prompts, no
# known_hosts persistence, and relaxed StrictModes for mounted home dirs.
RUN mkdir -p /var/run/sshd && \
    sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Install EFA drivers and libraries for optimal network performance.
# The EFA installer places its own OpenMPI build at /opt/amazon/openmpi.
# NOTE(review): "latest" is not reproducible — pin a concrete installer
# version for release builds (overridable via --build-arg).
ARG EFA_INSTALLER_VERSION=latest
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
      curl \
      libhwloc-dev \
      wget && \
    curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLER_VERSION.tar.gz | tar xvz -C /tmp && \
    cd /tmp/aws-efa-installer && \
    ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \
    cd && rm -rf /tmp/aws-efa-installer && \
    rm -rf /var/lib/apt/lists/*

# Install the AWS-OFI-NCCL plugin so NCCL can drive EFA through libfabric
ARG AWS_OFI_NCCL_VERSION=1.17.1
RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp && \
    cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION && \
    ./configure \
      --prefix=/opt/aws-ofi-nccl/install \
      --with-mpi=/opt/amazon/openmpi \
      --with-libfabric=/opt/amazon/efa \
      --with-cuda=/usr/local/cuda \
      --enable-platform-aws \
      --disable-tests && \
    make -j $(nproc) && \
    make install && \
    cd && rm -rf /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION

# Remove HPCX paths from shell-login environment files to avoid conflicts
# with the EFA-provided OpenMPI (best-effort; files may not mention hpcx)
RUN sed -i '/hpcx/d' /etc/environment || true && \
    sed -i '/hpcx/d' ~/.bashrc || true

# Runtime library/search paths. EFA paths come FIRST to override NVIDIA
# container defaults. LD_LIBRARY_PATH deliberately REPLACES (does not
# append to) the base image's value so stale HPC-X entries are dropped.
ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/lib
ENV OPAL_PREFIX=/opt/amazon/openmpi
ENV MPI_ROOT=/opt/amazon/openmpi
ENV NCCL_PROTO=simple

# libfabric/EFA tuning for NCCL over EFA
ENV FI_PROVIDER=efa
ENV FI_EFA_USE_DEVICE_RDMA=1
# NOTE(review): NCCL_DEBUG=INFO is verbose — consider WARN for production
ENV NCCL_DEBUG=INFO
ENV NCCL_SOCKET_IFNAME=^docker0,lo
ENV FI_EFA_FORK_SAFE=1

# Force-load the AWS OFI network plugin so NCCL uses EFA instead of sockets
ENV LD_PRELOAD=/opt/aws-ofi-nccl/install/lib/libnccl-net.so

RUN ldconfig

# Set working directory
WORKDIR /app

# Copy the dependency manifest FIRST so the pip layer is cached until
# requirements.txt changes; editing train.py no longer reinstalls deps.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the training script (most frequently changing input) last
COPY train.py /app/

# The base image already includes:
# - PyTorch with CUDA support
# - NCCL for multi-GPU communication
# - OpenMPI for distributed training
# - EFA support
# - All necessary CUDA libraries

# NOTE(review): no USER directive — the container runs as root, which sshd
# for the MPI Operator currently requires; confirm this matches cluster policy.

# Default command (can be overridden)
# CMD ["python", "train.py"]
8 changes: 4 additions & 4 deletions test/images/nvidia/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=8
ARG CUDA_MAJOR_VERSION=13
ARG CUDA_MINOR_VERSION=0

# Start with the NVIDIA CUDA base image
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
Expand Down Expand Up @@ -72,15 +72,15 @@ RUN git clone https://github.com/NVIDIA/nvbandwidth.git --branch $NVBANDWIDTH_VE
&& cd && rm -rf /tmp/cuda-samples

# Install NCCL
ARG LIBNCCL_VERSION=2.27.7-1
ARG LIBNCCL_VERSION=2.28.3-1
RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \
&& cd /tmp/nccl \
&& make -j $(nproc) \
&& make install \
&& cd && rm -rf /tmp/nccl

# Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=1.16.3
ARG AWS_OFI_NCCL_VERSION=1.17.1
RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp \
&& cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
&& ./configure \
Expand Down