Skip to content

Commit 05e0da5

Browse files
authored
Fix the AZs when creating subnets (#481)
* Verify GPU Direct RDMA is used on supported instance. * Fix the AZs when creating subnets.
1 parent 7483b1b commit 05e0da5

File tree

2 files changed

+71
-17
lines changed

2 files changed

+71
-17
lines changed

kubetest2/internal/deployers/eksapi/infra.go

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"errors"
77
"fmt"
88
"path"
9+
"slices"
910
"strings"
1011
"time"
1112

@@ -84,6 +85,31 @@ func (m *InfrastructureManager) createInfrastructureStack(opts *deployerOptions)
8485
if err != nil {
8586
return nil, err
8687
}
88+
// get two AZs for the subnets
89+
azs, err := m.clients.EC2().DescribeAvailabilityZones(context.TODO(), &ec2.DescribeAvailabilityZonesInput{})
90+
if err != nil {
91+
return nil, err
92+
}
93+
var subnetAzs []string
94+
if opts.CapacityReservation {
95+
subnetAzs, err = m.getAZsWithCapacity(opts)
96+
if err != nil {
97+
return nil, err
98+
}
99+
for _, az := range azs.AvailabilityZones {
100+
if len(subnetAzs) == 2 {
101+
break
102+
}
103+
if !slices.Contains(subnetAzs, *az.ZoneName) {
104+
subnetAzs = append(subnetAzs, *az.ZoneName)
105+
}
106+
}
107+
} else {
108+
for i := 0; i < 2; i++ {
109+
subnetAzs = append(subnetAzs, *azs.AvailabilityZones[i].ZoneName)
110+
}
111+
}
112+
klog.Infof("creating infrastructure stack with AZs: %v", subnetAzs)
87113
input := cloudformation.CreateStackInput{
88114
StackName: aws.String(m.resourceID),
89115
TemplateBody: aws.String(templates.Infrastructure),
@@ -97,6 +123,14 @@ func (m *InfrastructureManager) createInfrastructureStack(opts *deployerOptions)
97123
ParameterKey: aws.String("ResourceId"),
98124
ParameterValue: aws.String(m.resourceID),
99125
},
126+
{
127+
ParameterKey: aws.String("Subnet01AZ"),
128+
ParameterValue: aws.String(subnetAzs[0]),
129+
},
130+
{
131+
ParameterKey: aws.String("Subnet02AZ"),
132+
ParameterValue: aws.String(subnetAzs[1]),
133+
},
100134
},
101135
}
102136
if opts.ClusterRoleServicePrincipal != "" {
@@ -232,7 +266,7 @@ func (m *InfrastructureManager) deleteLeakedENIs() error {
232266
}
233267
}
234268
klog.Infof("deleted %d leaked ENI(s)!", len(enis))
235-
m.metrics.Record(infraLeakedENIs, float64(len(enis)), nil)
269+
m.metrics.Record(infraLeakedENIs, float64(len(enis)), nil)
236270
return nil
237271
}
238272

@@ -266,3 +300,29 @@ func (m *InfrastructureManager) getVPCCNINetworkInterfaceIds(vpcId string) ([]st
266300
}
267301
return enis, nil
268302
}
303+
304+
func (m *InfrastructureManager) getAZsWithCapacity(opts *deployerOptions) ([]string, error) {
305+
var subnetAzs []string
306+
capacityReservations, err := m.clients.EC2().DescribeCapacityReservations(context.TODO(), &ec2.DescribeCapacityReservationsInput{
307+
Filters: []ec2types.Filter{
308+
{
309+
Name: aws.String("instance-type"),
310+
Values: opts.InstanceTypes,
311+
},
312+
{
313+
Name: aws.String("state"),
314+
Values: []string{"active"},
315+
},
316+
},
317+
})
318+
if err != nil {
319+
return nil, err
320+
}
321+
for _, cr := range capacityReservations.CapacityReservations {
322+
if *cr.AvailableInstanceCount >= int32(opts.Nodes) {
323+
subnetAzs = append(subnetAzs, *cr.AvailabilityZone)
324+
break
325+
}
326+
}
327+
return subnetAzs, nil
328+
}

kubetest2/internal/deployers/eksapi/templates/infra.yaml

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ Parameters:
3939
ResourceId:
4040
Type: String
4141

42+
Subnet01AZ:
43+
Type: String
44+
45+
Subnet02AZ:
46+
Type: String
47+
4248
Metadata:
4349
AWS::CloudFormation::Interface:
4450
ParameterGroups:
@@ -275,10 +281,7 @@ Resources:
275281
DependsOn: IPv6CidrBlock
276282
Properties:
277283
AvailabilityZone:
278-
Fn::Select:
279-
- "0"
280-
- Fn::GetAZs:
281-
Ref: AWS::Region
284+
Ref: Subnet01AZ
282285
CidrBlock:
283286
Ref: PublicSubnet01Block
284287
Ipv6CidrBlock:
@@ -298,10 +301,7 @@ Resources:
298301
DependsOn: IPv6CidrBlock
299302
Properties:
300303
AvailabilityZone:
301-
Fn::Select:
302-
- "1"
303-
- Fn::GetAZs:
304-
Ref: AWS::Region
304+
Ref: Subnet02AZ
305305
CidrBlock:
306306
Ref: PublicSubnet02Block
307307
Ipv6CidrBlock:
@@ -343,10 +343,7 @@ Resources:
343343
DependsOn: IPv6CidrBlock
344344
Properties:
345345
AvailabilityZone:
346-
Fn::Select:
347-
- "0"
348-
- Fn::GetAZs:
349-
Ref: AWS::Region
346+
Ref: Subnet01AZ
350347
CidrBlock:
351348
Ref: PrivateSubnet01Block
352349
Ipv6CidrBlock:
@@ -365,10 +362,7 @@ Resources:
365362
DependsOn: IPv6CidrBlock
366363
Properties:
367364
AvailabilityZone:
368-
Fn::Select:
369-
- "1"
370-
- Fn::GetAZs:
371-
Ref: AWS::Region
365+
Ref: Subnet02AZ
372366
CidrBlock:
373367
Ref: PrivateSubnet02Block
374368
Ipv6CidrBlock:

0 commit comments

Comments
 (0)