Skip to content

Commit 504d29d

Browse files
committed
Add ai-ml nodeclasses and nodepools
Description / Motivation: The nodeclasses and nodepools that were created manually in the perflab-titan-1 cluster need to move into KIT so they can be reused in future runs. Related Asana Task: https://app.asana.com/1/8442528107068/project/1209254984904634/task/1211563354393458?focus=true Desktop Testing: Tested by creating a pipeline run: https://experimental.scalability.eks.aws.dev/#/namespaces/scalability/pipelineruns/chithres-titan-ai-ml-pipeline-run-v27. Once this commit is merged, I will also raise a PR for the ai-ml-load Pipeline; currently the Pipeline points at the nodepool and nodeclass URLs in my KIT fork.
1 parent 5ca94cc commit 504d29d

7 files changed

+427
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
---
# Karpenter NodePool for xlarge AI/ML inference nodes, one pool per
# availability zone. ${AZ} is a placeholder that must be substituted before
# this manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-inference-xlarge-${AZ}
spec:
  disruption:
    budgets:
      # Single budget with no reasons listed: 10% of nodes.
      - nodes: 10%
    consolidateAfter: 0s
    consolidationPolicy: WhenEmptyOrUnderutilized
  # NOTE(review): `replicas` presumably relies on Karpenter's static-capacity
  # support being enabled in the target cluster - confirm.
  replicas: 10
  template:
    metadata:
      labels:
        purpose: ml-xlarge
    spec:
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values: ["${AZ}"]
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        # Instance category is selected through the Karpenter AWS label
        # karpenter.k8s.aws/instance-category, the same key the monitoring and
        # training pools use. The previous key,
        # node.kubernetes.io/instance-category, is not a well-known Karpenter
        # label, so it did not restrict the instance types to m/r.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["m", "r"]
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values: ["xlarge"]
        # Gt "6" selects instance generation 7 and newer.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["6"]
        # Kept from the original manifest. These c-family types are already
        # ruled out once the category requirement above takes effect.
        - key: node.kubernetes.io/instance-type
          operator: NotIn
          values:
            - c7i-flex.xlarge
            - c7i.xlarge
            - c7a.xlarge
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
---
# Karpenter NodePool for 24xlarge monitoring nodes, one pool per availability
# zone. ${AZ} is a placeholder that must be substituted before this manifest
# is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: "ai-ml-monitoring-24xlarge-${AZ}"
spec:
  # NOTE(review): `replicas` presumably relies on Karpenter's static-capacity
  # support being enabled in the target cluster - confirm.
  replicas: 4
  limits:
    nodes: "26400"
  disruption:
    consolidationPolicy: WhenEmpty
    consolidateAfter: 0s
    budgets:
      # Budget for the Empty reason: 100% of nodes.
      - reasons:
          - Empty
        nodes: "100%"
      # Budget for the Drifted and Underutilized reasons: 10% of nodes.
      - reasons:
          - Drifted
          - Underutilized
        nodes: "10%"
  template:
    metadata:
      labels:
        purpose: ml-24xlarge
    spec:
      nodeClassRef:
        name: ai-training
        kind: EC2NodeClass
        group: karpenter.k8s.aws
      expireAfter: 720h0m0s
      # Nodes carry monitoring=true:NoSchedule, so only pods that tolerate
      # this taint are scheduled onto them.
      taints:
        - key: monitoring
          value: "true"
          effect: NoSchedule
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values: ["${AZ}"]
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c", "m", "r"]
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values: ["24xlarge"]
        # Gt "4" selects instance generation 5 and newer.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["4"]
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
---
# Karpenter NodePool for 12xlarge operator nodes, one pool per availability
# zone. ${AZ} is a placeholder that must be substituted before this manifest
# is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-operator-12xlarge-${AZ}
spec:
  disruption:
    budgets:
      # Budget for the Empty reason: 100% of nodes.
      - nodes: 100%
        reasons:
          - Empty
      # Budget for the Drifted and Underutilized reasons: 10% of nodes.
      - nodes: 10%
        reasons:
          - Drifted
          - Underutilized
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  # NOTE(review): `replicas` presumably relies on Karpenter's static-capacity
  # support being enabled in the target cluster - confirm.
  replicas: 5
  template:
    metadata:
      labels:
        purpose: ml-12xlarge
    spec:
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values: ["${AZ}"]
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        # Instance category is selected through the Karpenter AWS label
        # karpenter.k8s.aws/instance-category, the same key the monitoring and
        # training pools use. The previous key,
        # node.kubernetes.io/instance-category, is not a well-known Karpenter
        # label, so it did not restrict the instance types to c/m/r.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c", "m", "r"]
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values: ["12xlarge"]
        # Gt "6" selects instance generation 7 and newer.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["6"]
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
---
# Karpenter NodePool for medium/large training nodes, one pool per
# availability zone. ${AZ} is a placeholder that must be substituted before
# this manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: "ai-ml-training-large-${AZ}"
spec:
  # NOTE(review): `replicas` presumably relies on Karpenter's static-capacity
  # support being enabled in the target cluster - confirm.
  replicas: 1
  limits:
    nodes: "26400"
  disruption:
    consolidationPolicy: WhenEmpty
    consolidateAfter: 0s
    budgets:
      # Budget for the Empty reason: 100% of nodes.
      - reasons:
          - Empty
        nodes: "100%"
      # Budget for the Drifted and Underutilized reasons: 10% of nodes.
      - reasons:
          - Drifted
          - Underutilized
        nodes: "10%"
  template:
    metadata:
      labels:
        purpose: ml-large
        # NOTE(review): looks like a marker label used for drift testing -
        # confirm it is still wanted in the shared manifest.
        drift: drifting-test
    spec:
      nodeClassRef:
        name: ai-training
        kind: EC2NodeClass
        group: karpenter.k8s.aws
      expireAfter: 720h0m0s
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values: ["${AZ}"]
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c", "m", "r"]
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values: ["medium", "large"]
        # Gt "4" selects instance generation 5 and newer.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["4"]
        # Explicitly excluded instance type.
        - key: node.kubernetes.io/instance-type
          operator: NotIn
          values: ["c7a.medium"]
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
---
# EC2NodeClass "ai-training": AL2023 node template referenced through
# nodeClassRef by the ai-ml-* NodePools. ${ALIAS_VERSION}, ${CLUSTER_NAME},
# ${CLUSTER_ENDPOINT} and ${CLUSTER_CA} are placeholders that must be
# substituted before this manifest is applied.
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: ai-training
spec:
  amiFamily: AL2023
  amiSelectorTerms:
    - alias: "al2023@${ALIAS_VERSION}"
  role: "KarpenterNodeRole-${CLUSTER_NAME}"
  # Root volume: 70Gi gp3, removed together with the instance.
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        volumeType: gp3
        volumeSize: 70Gi
        deleteOnTermination: true
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 1
    httpTokens: required
  kubelet:
    maxPods: 110
    evictionHard:
      memory.available: "5%"
      nodefs.available: "10%"
      nodefs.inodesFree: "10%"
    kubeReserved:
      cpu: 100m
      memory: 100Mi
      ephemeral-storage: 1Gi
    systemReserved:
      cpu: 100m
      memory: 100Mi
      ephemeral-storage: 1Gi
  # Each list entry is a separate selector term.
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
    - tags:
        kubernetes.io/cluster/${CLUSTER_NAME}: owned
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
  # MIME multipart user data carrying a nodeadm NodeConfig document.
  # NOTE(review): the kubelet flags below hard-code
  # karpenter.sh/nodepool=titan-pool, which matches none of the ai-ml-*
  # NodePools that reference this class - confirm this is intended.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: application/node.eks.aws

    apiVersion: node.eks.aws/v1alpha1
    kind: NodeConfig
    spec:
      cluster:
        name: ${CLUSTER_NAME}
        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
        certificateAuthority: ${CLUSTER_CA}
        cidr: "172.20.0.0/16"
      kubelet:
        config:
          nodeStatusReportFrequency: "60m"
          nodeLeaseDurationSeconds: 120
          maxPods: 110
          clusterDNS: ["172.20.0.10"]
        flags:
          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
          - --register-with-taints=karpenter.sh/unregistered:NoExecute
    --BOUNDARY--
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
---
# EC2NodeClass "titan-class": node template that ships its own complete
# nodeadm NodeConfig in userData. ${ALIAS_VERSION}, ${CLUSTER_NAME},
# ${CLUSTER_ENDPOINT} and ${CLUSTER_CA} are placeholders that must be
# substituted before this manifest is applied.
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: titan-class
spec:
  # NOTE(review): amiFamily is Custom while the AMI is still picked through an
  # al2023 alias; presumably this makes Karpenter pass userData through as-is,
  # which is why the cluster details are spelled out below - confirm.
  amiFamily: Custom
  amiSelectorTerms:
    - alias: "al2023@${ALIAS_VERSION}"
  instanceProfile: "KarpenterNodeInstanceProfile-${CLUSTER_NAME}"
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 1
    httpTokens: required
  kubelet:
    maxPods: 110
    evictionHard:
      memory.available: "5%"
      nodefs.available: "10%"
      nodefs.inodesFree: "10%"
    kubeReserved:
      cpu: 100m
      memory: 100Mi
      ephemeral-storage: 1Gi
    systemReserved:
      cpu: 100m
      memory: 100Mi
      ephemeral-storage: 1Gi
  # Each list entry is a separate selector term.
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
    - tags:
        kubernetes.io/cluster/${CLUSTER_NAME}: owned
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
  # MIME multipart user data carrying a nodeadm NodeConfig document. The
  # kubelet flags label nodes with karpenter.sh/nodepool=titan-pool.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: application/node.eks.aws

    apiVersion: node.eks.aws/v1alpha1
    kind: NodeConfig
    spec:
      cluster:
        name: ${CLUSTER_NAME}
        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
        certificateAuthority: ${CLUSTER_CA}
        cidr: "172.20.0.0/16"
      kubelet:
        config:
          nodeStatusReportFrequency: "60m"
          nodeLeaseDurationSeconds: 120
          maxPods: 110
          clusterDNS: ["172.20.0.10"]
        flags:
          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
          - --register-with-taints=karpenter.sh/unregistered:NoExecute
    --BOUNDARY--

0 commit comments

Comments
 (0)