Skip to content
This repository was archived by the owner on Jan 11, 2023. It is now read-only.

Commit 1b07ace

Browse files
committed
ensure N series clusters get aks-docker-engine (#4221)
1 parent 546e4c6 commit 1b07ace

File tree

14 files changed

+370
-78
lines changed

14 files changed

+370
-78
lines changed

pkg/acsengine/engine.go

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -363,44 +363,6 @@ func getDCOSDefaultRepositoryURL(orchestratorType string, orchestratorVersion st
363363
return ""
364364
}
365365

366-
func isNSeriesSKU(profile *api.AgentPoolProfile) bool {
367-
/* If a new GPU sku becomes available, add a key to this map, but only if you have a confirmation
368-
that we have an agreement with NVIDIA for this specific gpu.
369-
*/
370-
dm := map[string]bool{
371-
// K80
372-
"Standard_NC6": true,
373-
"Standard_NC12": true,
374-
"Standard_NC24": true,
375-
"Standard_NC24r": true,
376-
// M60
377-
"Standard_NV6": true,
378-
"Standard_NV12": true,
379-
"Standard_NV24": true,
380-
"Standard_NV24r": true,
381-
// P40
382-
"Standard_ND6s": true,
383-
"Standard_ND12s": true,
384-
"Standard_ND24s": true,
385-
"Standard_ND24rs": true,
386-
// P100
387-
"Standard_NC6s_v2": true,
388-
"Standard_NC12s_v2": true,
389-
"Standard_NC24s_v2": true,
390-
"Standard_NC24rs_v2": true,
391-
// V100
392-
"Standard_NC6s_v3": true,
393-
"Standard_NC12s_v3": true,
394-
"Standard_NC24s_v3": true,
395-
"Standard_NC24rs_v3": true,
396-
}
397-
if _, ok := dm[profile.VMSize]; ok {
398-
return dm[profile.VMSize]
399-
}
400-
401-
return false
402-
}
403-
404366
func getDCOSCustomDataPublicIPStr(orchestratorType string, masterCount int) string {
405367
if orchestratorType == api.DCOS {
406368
var buf bytes.Buffer

pkg/acsengine/engine_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212

1313
"github.com/Azure/acs-engine/pkg/acsengine/transform"
1414
"github.com/Azure/acs-engine/pkg/api"
15+
"github.com/Azure/acs-engine/pkg/api/common"
1516
"github.com/Azure/acs-engine/pkg/api/v20160330"
1617
"github.com/Azure/acs-engine/pkg/api/vlabs"
1718
"github.com/Azure/acs-engine/pkg/i18n"
@@ -497,14 +498,14 @@ func TestIsNSeriesSKU(t *testing.T) {
497498
}
498499

499500
for _, sku := range validSkus {
500-
if !isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) {
501-
t.Fatalf("Expected isNSeriesSKU(%s) to be true", sku)
501+
if !common.IsNvidiaEnabledSKU(sku) {
502+
t.Fatalf("Expected common.IsNvidiaEnabledSKU(%s) to be true", sku)
502503
}
503504
}
504505

505506
for _, sku := range invalidSkus {
506-
if isNSeriesSKU(&api.AgentPoolProfile{VMSize: sku}) {
507-
t.Fatalf("Expected isNSeriesSKU(%s) to be false", sku)
507+
if common.IsNvidiaEnabledSKU(sku) {
508+
t.Fatalf("Expected common.IsNvidiaEnabledSKU(%s) to be false", sku)
508509
}
509510
}
510511
}

pkg/acsengine/template_generator.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ func (t *TemplateGenerator) getTemplateFuncMap(cs *api.ContainerService) templat
232232
storagetier, _ := getStorageAccountType(profile.VMSize)
233233
buf.WriteString(fmt.Sprintf(",storageprofile=managed,storagetier=%s", storagetier))
234234
}
235-
if isNSeriesSKU(profile) {
235+
if common.IsNvidiaEnabledSKU(profile.VMSize) {
236236
accelerator := "nvidia"
237237
buf.WriteString(fmt.Sprintf(",accelerator=%s", accelerator))
238238
}
@@ -786,7 +786,7 @@ func (t *TemplateGenerator) getTemplateFuncMap(cs *api.ContainerService) templat
786786
return cs.Properties.IsNVIDIADevicePluginEnabled()
787787
},
788788
"IsNSeriesSKU": func(profile *api.AgentPoolProfile) bool {
789-
return isNSeriesSKU(profile)
789+
return common.IsNvidiaEnabledSKU(profile.VMSize)
790790
},
791791
"UseSinglePlacementGroup": func(profile *api.AgentPoolProfile) bool {
792792
return *profile.SinglePlacementGroup

pkg/api/addons.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ func (cs *ContainerService) setAddonsConfig() {
146146

147147
defaultNVIDIADevicePluginAddonsConfig := KubernetesAddon{
148148
Name: NVIDIADevicePluginAddonName,
149-
Enabled: helpers.PointerToBool(IsNSeriesSKU(cs.Properties) && common.IsKubernetesVersionGe(o.OrchestratorVersion, "1.10.0")),
149+
Enabled: helpers.PointerToBool(cs.Properties.HasNSeriesSKU() && common.IsKubernetesVersionGe(o.OrchestratorVersion, "1.10.0")),
150150
Containers: []KubernetesContainerSpec{
151151
{
152152
Name: NVIDIADevicePluginAddonName,

pkg/api/common/helper.go

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,152 @@ func ValidateDNSPrefix(dnsName string) error {
6565
}
6666
return nil
6767
}
68+
69+
// IsNvidiaEnabledSKU determines if a VM SKU has nvidia driver support.
//
// vmSize is compared exactly (case-sensitive) against the list of known
// GPU-enabled sizes below. Any other value, including the empty string,
// returns false.
func IsNvidiaEnabledSKU(vmSize string) bool {
	/* If a new GPU sku becomes available, add it to the matching case below (or add a new case),
	   but only if you have a confirmation that we have an agreement with NVIDIA for this specific gpu.
	*/
	// A switch over constant strings avoids allocating a map on every call and
	// performs a single comparison pass instead of two map lookups.
	switch vmSize {
	// K80
	case "Standard_NC6", "Standard_NC12", "Standard_NC24", "Standard_NC24r":
		return true
	// M60
	case "Standard_NV6", "Standard_NV12", "Standard_NV24", "Standard_NV24r":
		return true
	// P40
	case "Standard_ND6s", "Standard_ND12s", "Standard_ND24s", "Standard_ND24rs":
		return true
	// P100
	case "Standard_NC6s_v2", "Standard_NC12s_v2", "Standard_NC24s_v2", "Standard_NC24rs_v2":
		return true
	// V100
	case "Standard_NC6s_v3", "Standard_NC12s_v3", "Standard_NC24s_v3", "Standard_NC24rs_v3":
		return true
	}

	return false
}
107+
108+
// GetNSeriesVMCasesForTesting returns a table of VM SKUs paired with whether or
// not we expect each of them to be nvidia-enabled.
//
// Each SKU appears exactly once. The table lists the known GPU sizes (expected
// true) followed by a non-GPU size, an unknown string and the empty string
// (expected false).
func GetNSeriesVMCasesForTesting() []struct {
	VMSKU    string
	Expected bool
} {
	cases := []struct {
		VMSKU    string
		Expected bool
	}{
		// K80
		{"Standard_NC6", true},
		{"Standard_NC12", true},
		{"Standard_NC24", true},
		{"Standard_NC24r", true},
		// M60
		{"Standard_NV6", true},
		{"Standard_NV12", true},
		{"Standard_NV24", true},
		{"Standard_NV24r", true},
		// P40
		{"Standard_ND6s", true},
		{"Standard_ND12s", true},
		{"Standard_ND24s", true},
		{"Standard_ND24rs", true},
		// P100
		{"Standard_NC6s_v2", true},
		{"Standard_NC12s_v2", true},
		{"Standard_NC24s_v2", true},
		{"Standard_NC24rs_v2", true},
		// V100
		{"Standard_NC6s_v3", true},
		{"Standard_NC12s_v3", true},
		{"Standard_NC24s_v3", true},
		{"Standard_NC24rs_v3", true},
		// Negative cases: a non-GPU size, an unknown string, and the empty string.
		{"Standard_D2_v2", false},
		{"gobledygook", false},
		{"", false},
	}

	return cases
}

pkg/api/common/helper_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,14 @@ func TestValidateDNSPrefix(t *testing.T) {
5656
}
5757
}
5858
}
59+
60+
func TestIsNvidiaEnabledSKU(t *testing.T) {
61+
cases := GetNSeriesVMCasesForTesting()
62+
63+
for _, c := range cases {
64+
ret := IsNvidiaEnabledSKU(c.VMSKU)
65+
if ret != c.Expected {
66+
t.Fatalf("expected IsNvidiaEnabledSKU(%s) to return %t, but instead got %t", c.VMSKU, c.Expected, ret)
67+
}
68+
}
69+
}

pkg/api/defaults.go

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -433,19 +433,29 @@ func (p *Properties) setAgentProfileDefaults(isUpgrade, isScale bool) {
433433
profile.AcceleratedNetworkingEnabledWindows = helpers.PointerToBool(DefaultAcceleratedNetworkingWindowsEnabled)
434434
}
435435

436-
if profile.Distro == "" && profile.OSType != Windows {
437-
if p.OrchestratorProfile.IsKubernetes() {
438-
if profile.OSDiskSizeGB != 0 && profile.OSDiskSizeGB < VHDDiskSizeAKS {
439-
profile.Distro = Ubuntu
440-
} else {
441-
if IsNSeriesSKU(p) {
442-
profile.Distro = AKSDockerEngine
436+
if profile.OSType != Windows {
437+
if profile.Distro == "" {
438+
if p.OrchestratorProfile.IsKubernetes() {
439+
if profile.OSDiskSizeGB != 0 && profile.OSDiskSizeGB < VHDDiskSizeAKS {
440+
profile.Distro = Ubuntu
443441
} else {
444-
profile.Distro = AKS
442+
if profile.IsNSeriesSKU() {
443+
profile.Distro = AKSDockerEngine
444+
} else {
445+
profile.Distro = AKS
446+
}
445447
}
448+
} else if !p.OrchestratorProfile.IsOpenShift() {
449+
profile.Distro = Ubuntu
450+
}
451+
// Ensure distro is set properly for N Series SKUs, because
452+
// (1) At present, "aks-docker-engine" and "ubuntu" are the only working distro bases for running GPU workloads on N Series SKUs
453+
// (2) Previous versions of acs-engine had working implementations using the "aks" distro value,
454+
// so we need to hard override it in order to produce a working cluster in upgrade/scale contexts
455+
} else if p.OrchestratorProfile.IsKubernetes() && (isUpgrade || isScale) && profile.IsNSeriesSKU() {
456+
if profile.Distro == AKS {
457+
profile.Distro = AKSDockerEngine
446458
}
447-
} else if !p.OrchestratorProfile.IsOpenShift() {
448-
profile.Distro = Ubuntu
449459
}
450460
}
451461

0 commit comments

Comments
 (0)