File tree Expand file tree Collapse file tree 7 files changed +427
-0
lines changed Expand file tree Collapse file tree 7 files changed +427
-0
---
# NodePool: ai-ml-inference-xlarge-${AZ}
# One pool per availability zone; ${AZ} is a placeholder that must be
# substituted before the manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-inference-xlarge-${AZ}
spec:
  disruption:
    budgets:
      - nodes: "10%"
    consolidateAfter: 0s
    consolidationPolicy: WhenEmptyOrUnderutilized
  replicas: 10
  template:
    metadata:
      labels:
        purpose: ml-xlarge
    spec:
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        # Pin the pool to the single zone named in metadata.name.
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        # Instance category uses the karpenter.k8s.aws/ prefix, the same key
        # the monitoring and training NodePools use. The previous
        # node.kubernetes.io/instance-category key is not the label the other
        # pools filter on.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - xlarge
        # Gt takes a single integer written as a string; no padding allowed.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "6"
        # Explicit deny-list of c7 xlarge types, kept as an extra guard.
        - key: node.kubernetes.io/instance-type
          operator: NotIn
          values:
            - c7i-flex.xlarge
            - c7i.xlarge
            - c7a.xlarge
---
# NodePool: ai-ml-monitoring-24xlarge-${AZ}
# Tainted pool of 24xlarge nodes; only pods tolerating monitoring=true:NoSchedule
# can land here. ${AZ} must be substituted before the manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-monitoring-24xlarge-${AZ}
spec:
  disruption:
    budgets:
      # Empty nodes may all be disrupted at once.
      - nodes: "100%"
        reasons:
          - Empty
      # Drifted/underutilized nodes are disrupted at most 10% at a time.
      - nodes: "10%"
        reasons:
          - Drifted
          - Underutilized
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  limits:
    # Quoted value must not contain padding.
    nodes: "26400"
  replicas: 4
  template:
    metadata:
      labels:
        purpose: ml-24xlarge
    spec:
      expireAfter: 720h0m0s
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        # Pin the pool to the single zone named in metadata.name.
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - c
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - 24xlarge
        # Gt takes a single integer written as a string; no padding allowed.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "4"
      taints:
        # Value is the exact string "true" so that tolerations written as
        # monitoring=true match it.
        - effect: NoSchedule
          key: monitoring
          value: "true"
---
# NodePool: ai-ml-operator-12xlarge-${AZ}
# One pool per availability zone; ${AZ} is a placeholder that must be
# substituted before the manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-operator-12xlarge-${AZ}
spec:
  disruption:
    budgets:
      # Empty nodes may all be disrupted at once.
      - nodes: "100%"
        reasons:
          - Empty
      # Drifted/underutilized nodes are disrupted at most 10% at a time.
      - nodes: "10%"
        reasons:
          - Drifted
          - Underutilized
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  replicas: 5
  template:
    metadata:
      labels:
        purpose: ml-12xlarge
    spec:
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        # Pin the pool to the single zone named in metadata.name.
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        # Instance category uses the karpenter.k8s.aws/ prefix, the same key
        # the monitoring and training NodePools use. The previous
        # node.kubernetes.io/instance-category key is not the label the other
        # pools filter on.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - c
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - 12xlarge
        # Gt takes a single integer written as a string; no padding allowed.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "6"
---
# NodePool: ai-ml-training-large-${AZ}
# Small (medium/large) instances for the training pool. ${AZ} must be
# substituted before the manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-training-large-${AZ}
spec:
  disruption:
    budgets:
      # Empty nodes may all be disrupted at once.
      - nodes: "100%"
        reasons:
          - Empty
      # Drifted/underutilized nodes are disrupted at most 10% at a time.
      - nodes: "10%"
        reasons:
          - Drifted
          - Underutilized
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  limits:
    # Quoted value must not contain padding.
    nodes: "26400"
  replicas: 1
  template:
    metadata:
      labels:
        # NOTE(review): presumably toggled to trigger drift during testing;
        # confirm whether this label should ship.
        drift: drifting-test
        purpose: ml-large
    spec:
      expireAfter: 720h0m0s
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        # Pin the pool to the single zone named in metadata.name.
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - c
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - medium
            - large
        # Gt takes a single integer written as a string; no padding allowed.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "4"
        # Explicitly exclude c7a.medium from the otherwise-allowed set.
        - key: node.kubernetes.io/instance-type
          operator: NotIn
          values:
            - c7a.medium
---
# EC2NodeClass: ai-training
# Referenced by the ai-ml-* NodePools via nodeClassRef. ${ALIAS_VERSION},
# ${CLUSTER_NAME}, ${CLUSTER_ENDPOINT} and ${CLUSTER_CA} are placeholders that
# must be substituted before the manifest is applied.
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: ai-training
spec:
  amiFamily: AL2023
  amiSelectorTerms:
    # Alias must be exactly "<family>@<version>" with no padding.
    - alias: "al2023@${ALIAS_VERSION}"
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        deleteOnTermination: true
        volumeSize: 70Gi
        volumeType: gp3
  kubelet:
    evictionHard:
      memory.available: 5%
      nodefs.available: 10%
      nodefs.inodesFree: 10%
    kubeReserved:
      cpu: 100m
      ephemeral-storage: 1Gi
      memory: 100Mi
    maxPods: 110
    systemReserved:
      cpu: 100m
      ephemeral-storage: 1Gi
      memory: 100Mi
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 1
    # IMDSv2 only.
    httpTokens: required
  role: KarpenterNodeRole-${CLUSTER_NAME}
  # Selector tag values must equal the real tag values exactly; stray
  # whitespace inside the quotes would prevent any match.
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
    - tags:
        kubernetes.io/cluster/${CLUSTER_NAME}: owned
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
  # MIME multipart document carrying a nodeadm NodeConfig.
  # NOTE(review): the --node-labels flag below hard-codes
  # karpenter.sh/nodepool=titan-pool, but no NodePool in this file has that
  # name; confirm this is intended for the ai-ml-* pools.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: application/node.eks.aws

    apiVersion: node.eks.aws/v1alpha1
    kind: NodeConfig
    spec:
      cluster:
        name: ${CLUSTER_NAME}
        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
        certificateAuthority: ${CLUSTER_CA}
        cidr: "172.20.0.0/16"
      kubelet:
        config:
          nodeStatusReportFrequency: "60m"
          nodeLeaseDurationSeconds: 120
          maxPods: 110
          clusterDNS: ["172.20.0.10"]
        flags:
          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
          - --register-with-taints=karpenter.sh/unregistered:NoExecute
    --BOUNDARY--
---
# EC2NodeClass: titan-class
# Custom-family node class that supplies its own complete userData.
# ${ALIAS_VERSION}, ${CLUSTER_NAME}, ${CLUSTER_ENDPOINT} and ${CLUSTER_CA} are
# placeholders that must be substituted before the manifest is applied.
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: titan-class
spec:
  amiFamily: Custom
  amiSelectorTerms:
    # Alias must be exactly "<family>@<version>" with no padding.
    - alias: "al2023@${ALIAS_VERSION}"
  instanceProfile: KarpenterNodeInstanceProfile-${CLUSTER_NAME}
  kubelet:
    evictionHard:
      memory.available: 5%
      nodefs.available: 10%
      nodefs.inodesFree: 10%
    kubeReserved:
      cpu: 100m
      ephemeral-storage: 1Gi
      memory: 100Mi
    maxPods: 110
    systemReserved:
      cpu: 100m
      ephemeral-storage: 1Gi
      memory: 100Mi
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 1
    # IMDSv2 only.
    httpTokens: required
  # Selector tag values must equal the real tag values exactly; stray
  # whitespace inside the quotes would prevent any match.
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
    - tags:
        kubernetes.io/cluster/${CLUSTER_NAME}: owned
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
  # MIME multipart document carrying a nodeadm NodeConfig.
  # NOTE(review): presumably, with amiFamily Custom, this userData is used
  # as-is, which is why node labels and the unregistered taint are set
  # explicitly here - confirm against the EC2NodeClass documentation.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: application/node.eks.aws

    apiVersion: node.eks.aws/v1alpha1
    kind: NodeConfig
    spec:
      cluster:
        name: ${CLUSTER_NAME}
        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
        certificateAuthority: ${CLUSTER_CA}
        cidr: "172.20.0.0/16"
      kubelet:
        config:
          nodeStatusReportFrequency: "60m"
          nodeLeaseDurationSeconds: 120
          maxPods: 110
          clusterDNS: ["172.20.0.10"]
        flags:
          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
          - --register-with-taints=karpenter.sh/unregistered:NoExecute
    --BOUNDARY--
You can’t perform that action at this time.
0 commit comments