Skip to content

Commit 7f1d75c

Browse files
Add DRIVER_VERSION and KERNEL_MODULE_TYPE env vars to k8s-driver-manager init container for ClusterPolicy controller
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent 92170b1 commit 7f1d75c

18 files changed

+121
-10
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -499,6 +499,15 @@ type DriverSpec struct {
499499
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:auto,urn:alm:descriptor:com.tectonic.ui:select:open,urn:alm:descriptor:com.tectonic.ui:select:proprietary"
500500
KernelModuleType string `json:"kernelModuleType,omitempty"`
501501

502+
// DriverType defines the type of NVIDIA driver to be deployed.
503+
// Accepted values are gpu, vgpu, and vgpu-host-manager.
504+
// +kubebuilder:validation:Enum=gpu;vgpu;vgpu-host-manager
505+
// +kubebuilder:validation:Optional
506+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Driver Type"
507+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.description="Driver Type"
508+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:gpu,urn:alm:descriptor:com.tectonic.ui:select:vgpu,urn:alm:descriptor:com.tectonic.ui:select:vgpu-host-manager"
509+
DriverType string `json:"driverType,omitempty"`
510+
502511
// Enabled indicates if deployment of NVIDIA Driver through operator is enabled
503512
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
504513
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA Driver deployment through GPU Operator"

controllers/object_controls.go

Lines changed: 27 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,10 @@ const (
160160
OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED"
161161
// KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type
162162
KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE"
163+
// DriverTypeEnvName is the name of the driver-container envvar to set the driver type
164+
DriverTypeEnvName = "DRIVER_TYPE"
165+
// DriverVersionEnvName is the name of the envvar to set the desired driver version
166+
DriverVersionEnvName = "DRIVER_VERSION"
163167
// MPSRootEnvName is the name of the envvar for configuring the MPS root
164168
MPSRootEnvName = "MPS_ROOT"
165169
// DefaultMPSRoot is the default MPS root path on the host
@@ -1002,7 +1006,7 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C
10021006
}
10031007

10041008
// update driver-manager initContainer
1005-
err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA)
1009+
err = transformDriverManagerInitContainer(obj, &config.Driver.Manager, config.Driver.GPUDirectRDMA, config.Driver.Version, config.Driver.KernelModuleType, config.Driver.DriverType)
10061010
if err != nil {
10071011
return err
10081012
}
@@ -1050,7 +1054,7 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C
10501054
// TransformVGPUManager transforms NVIDIA vGPU Manager daemonset with required config as per ClusterPolicy
10511055
func TransformVGPUManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
10521056
// update k8s-driver-manager initContainer
1053-
err := transformDriverManagerInitContainer(obj, &config.VGPUManager.DriverManager, nil)
1057+
err := transformDriverManagerInitContainer(obj, &config.VGPUManager.DriverManager, nil, config.VGPUManager.Version, "", "")
10541058
if err != nil {
10551059
return fmt.Errorf("failed to transform k8s-driver-manager initContainer for vGPU Manager: %v", err)
10561060
}
@@ -2009,7 +2013,7 @@ func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec
20092013
// TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy
20102014
func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
20112015
// update k8s-driver-manager initContainer
2012-
err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil)
2016+
err := transformDriverManagerInitContainer(obj, &config.VFIOManager.DriverManager, nil, config.VFIOManager.Version, "", "")
20132017
if err != nil {
20142018
return fmt.Errorf("failed to transform k8s-driver-manager initContainer for VFIO Manager: %v", err)
20152019
}
@@ -2741,7 +2745,7 @@ func transformConfigManagerSidecarContainer(obj *appsv1.DaemonSet, config *gpuv1
27412745
return nil
27422746
}
27432747

2744-
func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec) error {
2748+
func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpec *gpuv1.DriverManagerSpec, rdmaSpec *gpuv1.GPUDirectRDMASpec, driverVersion string, kernelModuleType string, driverType string) error {
27452749
container := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")
27462750

27472751
if container == nil {
@@ -2765,6 +2769,21 @@ func transformDriverManagerInitContainer(obj *appsv1.DaemonSet, driverManagerSpe
27652769
}
27662770
}
27672771

2772+
// set driver version for config change detection
2773+
if driverVersion != "" {
2774+
setContainerEnv(container, DriverVersionEnvName, driverVersion)
2775+
}
2776+
2777+
// set kernel module type for config change detection
2778+
if kernelModuleType != "" {
2779+
setContainerEnv(container, KernelModuleTypeEnvName, kernelModuleType)
2780+
}
2781+
2782+
// set driver type for config change detection
2783+
if driverType != "" {
2784+
setContainerEnv(container, DriverTypeEnvName, driverType)
2785+
}
2786+
27682787
// set/append environment variables for driver-manager initContainer
27692788
if len(driverManagerSpec.Env) > 0 {
27702789
for _, env := range driverManagerSpec.Env {
@@ -3424,6 +3443,10 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
34243443
}
34253444
}
34263445

3446+
if len(config.Driver.DriverType) > 0 {
3447+
setContainerEnv(driverContainer, DriverTypeEnvName, config.Driver.DriverType)
3448+
}
3449+
34273450
// set container probe timeouts
34283451
if config.Driver.StartupProbe != nil {
34293452
setContainerProbe(driverContainer, config.Driver.StartupProbe, Startup)

controllers/transforms_test.go

Lines changed: 56 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1719,7 +1719,9 @@ func TestTransformVFIOManager(t *testing.T) {
17191719
Name: "k8s-driver-manager",
17201720
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0",
17211721
ImagePullPolicy: corev1.PullIfNotPresent,
1722-
Env: mockEnvCore,
1722+
Env: append([]corev1.EnvVar{
1723+
{Name: DriverVersionEnvName, Value: "v1.0.0"},
1724+
}, mockEnvCore...),
17231725
}).
17241726
WithPullSecret(secret),
17251727
},
@@ -1937,10 +1939,12 @@ func newBoolPtr(b bool) *bool {
19371939

19381940
func TestTransformDriverManagerInitContainer(t *testing.T) {
19391941
testCases := []struct {
1940-
description string
1941-
ds Daemonset
1942-
cpSpec *gpuv1.ClusterPolicySpec
1943-
expectedDs Daemonset
1942+
description string
1943+
ds Daemonset
1944+
cpSpec *gpuv1.ClusterPolicySpec
1945+
driverVersion string
1946+
kernelModuleType string
1947+
expectedDs Daemonset
19441948
}{
19451949
{
19461950
description: "transform k8s-driver-manager initContainer",
@@ -1963,6 +1967,8 @@ func TestTransformDriverManagerInitContainer(t *testing.T) {
19631967
},
19641968
},
19651969
},
1970+
driverVersion: "",
1971+
kernelModuleType: "",
19661972
expectedDs: NewDaemonset().WithInitContainer(corev1.Container{
19671973
Name: "k8s-driver-manager",
19681974
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0",
@@ -1974,11 +1980,39 @@ func TestTransformDriverManagerInitContainer(t *testing.T) {
19741980
},
19751981
}).WithInitContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret"),
19761982
},
1983+
{
1984+
description: "transform k8s-driver-manager initContainer with driver version and kernel module type",
1985+
ds: NewDaemonset().
1986+
WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}).
1987+
WithInitContainer(corev1.Container{Name: "dummy"}),
1988+
cpSpec: &gpuv1.ClusterPolicySpec{
1989+
Driver: gpuv1.DriverSpec{
1990+
Manager: gpuv1.DriverManagerSpec{
1991+
Repository: "nvcr.io/nvidia/cloud-native",
1992+
Image: "k8s-driver-manager",
1993+
Version: "v1.0.0",
1994+
ImagePullPolicy: "IfNotPresent",
1995+
ImagePullSecrets: []string{"pull-secret"},
1996+
},
1997+
},
1998+
},
1999+
driverVersion: "550.90.12",
2000+
kernelModuleType: "open",
2001+
expectedDs: NewDaemonset().WithInitContainer(corev1.Container{
2002+
Name: "k8s-driver-manager",
2003+
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v1.0.0",
2004+
ImagePullPolicy: corev1.PullIfNotPresent,
2005+
Env: []corev1.EnvVar{
2006+
{Name: DriverVersionEnvName, Value: "550.90.12"},
2007+
{Name: KernelModuleTypeEnvName, Value: "open"},
2008+
},
2009+
}).WithInitContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret"),
2010+
},
19772011
}
19782012

19792013
for _, tc := range testCases {
19802014
t.Run(tc.description, func(t *testing.T) {
1981-
err := transformDriverManagerInitContainer(tc.ds.DaemonSet, &tc.cpSpec.Driver.Manager, tc.cpSpec.Driver.GPUDirectRDMA)
2015+
err := transformDriverManagerInitContainer(tc.ds.DaemonSet, &tc.cpSpec.Driver.Manager, tc.cpSpec.Driver.GPUDirectRDMA, tc.driverVersion, tc.kernelModuleType, tc.cpSpec.Driver.DriverType)
19822016
require.NoError(t, err)
19832017
require.EqualValues(t, tc.expectedDs, tc.ds)
19842018
})
@@ -2665,6 +2699,9 @@ func TestTransformDriver(t *testing.T) {
26652699
}).WithInitContainer(corev1.Container{
26662700
Name: "k8s-driver-manager",
26672701
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
2702+
Env: []corev1.EnvVar{
2703+
{Name: DriverVersionEnvName, Value: "570.172.08"},
2704+
},
26682705
}),
26692706
errorExpected: false,
26702707
},
@@ -2962,6 +2999,9 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) {
29622999
Name: "k8s-driver-manager",
29633000
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
29643001
ImagePullPolicy: corev1.PullIfNotPresent,
3002+
Env: []corev1.EnvVar{
3003+
{Name: DriverVersionEnvName, Value: "570.172.08"},
3004+
},
29653005
}).WithVolume(corev1.Volume{
29663006
Name: "licensing-config",
29673007
VolumeSource: corev1.VolumeSource{
@@ -3016,6 +3056,9 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) {
30163056
Name: "k8s-driver-manager",
30173057
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
30183058
ImagePullPolicy: corev1.PullIfNotPresent,
3059+
Env: []corev1.EnvVar{
3060+
{Name: DriverVersionEnvName, Value: "570.172.08"},
3061+
},
30193062
}).WithVolume(corev1.Volume{
30203063
Name: "licensing-config",
30213064
VolumeSource: corev1.VolumeSource{
@@ -3140,6 +3183,9 @@ func TestTransformDriverWithResources(t *testing.T) {
31403183
}).WithInitContainer(corev1.Container{
31413184
Name: "k8s-driver-manager",
31423185
Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0",
3186+
Env: []corev1.EnvVar{
3187+
{Name: DriverVersionEnvName, Value: "570.172.08"},
3188+
},
31433189
}),
31443190
errorExpected: false,
31453191
},
@@ -3219,6 +3265,10 @@ func TestTransformDriverRDMA(t *testing.T) {
32193265
Name: "USE_HOST_MOFED",
32203266
Value: "true",
32213267
},
3268+
{
3269+
Name: "DRIVER_VERSION",
3270+
Value: "570.172.08",
3271+
},
32223272
},
32233273
}).WithContainer(corev1.Container{
32243274
Name: "nvidia-peermem",

internal/state/driver_test.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -627,6 +627,7 @@ func getMinimalDriverRenderData() *driverRenderData {
627627
LivenessProbe: getDefaultContainerProbeSpec(),
628628
ReadinessProbe: getDefaultContainerProbeSpec(),
629629
DriverType: nvidiav1alpha1.GPU,
630+
Version: "525.85.03",
630631
Resources: &nvidiav1alpha1.ResourceRequirements{
631632
Requests: corev1.ResourceList{
632633
corev1.ResourceCPU: resource.MustParse("200m"),

internal/state/testdata/golden/driver-additional-configs.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -241,6 +241,8 @@ spec:
241241
valueFrom:
242242
fieldRef:
243243
fieldPath: metadata.namespace
244+
- name: DRIVER_VERSION
245+
value: 525.85.03
244246
- name: DRIVER_TYPE
245247
value: gpu
246248
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel

internal/state/testdata/golden/driver-gdrcopy-openshift.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -410,6 +410,8 @@ spec:
410410
valueFrom:
411411
fieldRef:
412412
fieldPath: metadata.namespace
413+
- name: DRIVER_VERSION
414+
value: 525.85.03
413415
- name: DRIVER_TYPE
414416
value: gpu
415417
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel

internal/state/testdata/golden/driver-gdrcopy.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -293,6 +293,8 @@ spec:
293293
valueFrom:
294294
fieldRef:
295295
fieldPath: metadata.namespace
296+
- name: DRIVER_VERSION
297+
value: 525.85.03
296298
- name: DRIVER_TYPE
297299
value: gpu
298300
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel

internal/state/testdata/golden/driver-gds.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -293,6 +293,8 @@ spec:
293293
valueFrom:
294294
fieldRef:
295295
fieldPath: metadata.namespace
296+
- name: DRIVER_VERSION
297+
value: 525.85.03
296298
- name: DRIVER_TYPE
297299
value: gpu
298300
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel

internal/state/testdata/golden/driver-minimal.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -232,6 +232,8 @@ spec:
232232
valueFrom:
233233
fieldRef:
234234
fieldPath: metadata.namespace
235+
- name: DRIVER_VERSION
236+
value: 525.85.03
235237
- name: DRIVER_TYPE
236238
value: gpu
237239
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel

internal/state/testdata/golden/driver-openshift-drivertoolkit.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -348,6 +348,8 @@ spec:
348348
valueFrom:
349349
fieldRef:
350350
fieldPath: metadata.namespace
351+
- name: DRIVER_VERSION
352+
value: 525.85.03
351353
- name: DRIVER_TYPE
352354
value: gpu
353355
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel

0 commit comments

Comments
 (0)