Skip to content

Commit acdfcba

Browse files
Merge pull request #2764 from Nordix/Sunnatillo/add-failure-domain
✨ Add FailureDomain Support
2 parents 226c65d + bced137 commit acdfcba

11 files changed

+282
-12
lines changed

api/v1beta1/metal3cluster_types.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,15 @@ type Metal3ClusterSpec struct {
6666
// Default value is true, it is set in the webhook.
6767
// +optional
6868
CloudProviderEnabled *bool `json:"cloudProviderEnabled,omitempty"`
69+
70+
// FailureDomains specifies a list of failure zones that can be used.
71+
// +optional
72+
FailureDomains FailureDomains `json:"failureDomains,omitempty"`
6973
}
7074

75+
// FailureDomains has the same underlying type as clusterv1beta1.FailureDomains (a collection of failure domain specs).
76+
type FailureDomains clusterv1beta1.FailureDomains
77+
7178
// IsValid returns an error if the object is not valid, otherwise nil. The
7279
// string representation of the error is suitable for human consumption.
7380
func (s *Metal3ClusterSpec) IsValid() error {
@@ -115,6 +122,9 @@ type Metal3ClusterStatus struct {
115122
// v1beta2 groups all the fields that will be added or modified in Metal3Cluster's status with the V1Beta2 version.
116123
// +optional
117124
V1Beta2 *Metal3ClusterV1Beta2Status `json:"v1beta2,omitempty"`
125+
// FailureDomains specifies a list of failure zones that can be used.
126+
// +optional
127+
FailureDomains FailureDomains `json:"failureDomains,omitempty"`
118128
}
119129

120130
// Metal3ClusterV1Beta2Status groups all the fields that will be added or modified in Metal3ClusterStatus with the V1Beta2 version.

api/v1beta1/metal3machine_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,9 @@ type Metal3MachineSpec struct {
160160
// +kubebuilder:validation:Enum:=metadata;disabled
161161
// +optional
162162
AutomatedCleaningMode *string `json:"automatedCleaningMode,omitempty"`
163+
164+
// FailureDomain is the failure domain unique identifier this Machine should be attached to, as defined in Cluster API.
165+
FailureDomain string `json:"failureDomain,omitempty"`
163166
}
164167

165168
// Metal3MachineStatus defines the observed state of Metal3Machine.

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 35 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

baremetal/metal3machine_manager.go

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ const (
7676
ProviderIDPrefix = "metal3://"
7777
// ProviderLabelPrefix is a label prefix for ProviderID.
7878
ProviderLabelPrefix = "metal3.io/uuid"
79+
// FailureDomainLabelName is a label name for FailureDomains.
80+
FailureDomainLabelName = "infrastructure.cluster.x-k8s.io/failure-domain"
7981
)
8082

8183
var (
@@ -924,9 +926,11 @@ func (m *MachineManager) chooseHost(ctx context.Context) (*bmov1alpha1.BareMetal
924926
// If host is found in `Ready` state, pick it
925927
if len(hostsInAvailableStateWithNodeReuse) != 0 {
926928
m.Log.Info("Found host(s) with nodeReuseLabelName in Ready/Available state, choosing the host", "availabeHostCount", len(hostsInAvailableStateWithNodeReuse), "host", host.Name)
927-
rHost, _ := rand.Int(rand.Reader, big.NewInt(int64(len(hostsInAvailableStateWithNodeReuse))))
928-
randomHost := rHost.Int64()
929-
chosenHost = hostsInAvailableStateWithNodeReuse[randomHost]
929+
chosenHost, err = m.pickHost(hostsInAvailableStateWithNodeReuse)
930+
if err != nil {
931+
m.Log.Error(err, "Failed to choose host, not choosing host")
932+
return nil, nil, err
933+
}
930934
} else if len(hostsInNotAvailableStateWithNodeReuse) != 0 {
931935
errMessage := fmt.Sprint("Found BareMetalHost(s) with nodeReuseLabelName in not-available state, requeuing the BareMetalHost", "notAvailabeHostCount", len(hostsInNotAvailableStateWithNodeReuse), "hoststate", host.Status.Provisioning.State, "host", host.Name)
932936
m.Log.Info(errMessage)
@@ -937,9 +941,11 @@ func (m *MachineManager) chooseHost(ctx context.Context) (*bmov1alpha1.BareMetal
937941
// If there are no hosts with nodeReuseLabelName, fall back
938942
// to the current flow and select hosts randomly.
939943
m.Log.Info("host(s) count available, choosing a random host", "availabeHostCount", len(availableHosts))
940-
rHost, _ := rand.Int(rand.Reader, big.NewInt(int64(len(availableHosts))))
941-
randomHost := rHost.Int64()
942-
chosenHost = availableHosts[randomHost]
944+
chosenHost, err = m.pickHost(availableHosts)
945+
if err != nil {
946+
m.Log.Error(err, "Failed to choose host, not choosing host")
947+
return nil, nil, err
948+
}
943949
}
944950

945951
helper, err := v1beta1patch.NewHelper(chosenHost, m.client)
@@ -2115,3 +2121,46 @@ func (m *MachineManager) duplicateProviderIDsExist(validNodes map[string][]corev
21152121
}
21162122
return nil
21172123
}
2124+
2125+
// Picks host from list of available hosts, if failureDomain is set, tries to choose from hosts in failureDomain.
2126+
// When none available in failureDomain it chooses from all available hosts.
2127+
func (m *MachineManager) pickHost(availableHosts []*bmov1alpha1.BareMetalHost) (*bmov1alpha1.BareMetalHost, error) {
2128+
var chosenHost *bmov1alpha1.BareMetalHost
2129+
var availableHostsInFailureDomain []*bmov1alpha1.BareMetalHost
2130+
2131+
// When failureDomain is set, create a list from available hosts in failureDomain
2132+
if m.Metal3Machine.Spec.FailureDomain != "" {
2133+
labelSelector := labels.NewSelector()
2134+
var reqs labels.Requirements
2135+
var r *labels.Requirement
2136+
r, err := labels.NewRequirement(FailureDomainLabelName, selection.Equals, []string{m.Metal3Machine.Spec.FailureDomain})
2137+
2138+
if err != nil {
2139+
m.Log.Error(err, "Failed to create FailureDomain MatchLabel requirement, not choosing host")
2140+
return nil, err
2141+
}
2142+
reqs = append(reqs, *r)
2143+
labelSelector = labelSelector.Add(reqs...)
2144+
2145+
for _, host := range availableHosts {
2146+
if labelSelector.Matches(labels.Set(host.ObjectMeta.Labels)) {
2147+
availableHostsInFailureDomain = append(availableHostsInFailureDomain, host)
2148+
}
2149+
}
2150+
if len(availableHostsInFailureDomain) == 0 {
2151+
m.Log.Info("No available hosts in FailureDomain", m.Metal3Machine.Spec.FailureDomain, "choosing from other available hosts")
2152+
}
2153+
}
2154+
2155+
if len(availableHostsInFailureDomain) > 0 {
2156+
rHost, _ := rand.Int(rand.Reader, big.NewInt(int64(len(availableHostsInFailureDomain))))
2157+
randomHost := rHost.Int64()
2158+
chosenHost = availableHostsInFailureDomain[randomHost]
2159+
} else {
2160+
rHost, _ := rand.Int(rand.Reader, big.NewInt(int64(len(availableHosts))))
2161+
randomHost := rHost.Int64()
2162+
chosenHost = availableHosts[randomHost]
2163+
}
2164+
2165+
return chosenHost, nil
2166+
}

baremetal/metal3machine_manager_test.go

Lines changed: 99 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,18 @@ var _ = Describe("Metal3Machine manager", func() {
570570
},
571571
},
572572
}
573+
hostWithFailureDomainLabel := bmov1alpha1.BareMetalHost{
574+
ObjectMeta: metav1.ObjectMeta{
575+
Name: "hostWithFailureDomainLabel",
576+
Namespace: namespaceName,
577+
Labels: map[string]string{FailureDomainLabelName: "my-fd-1"},
578+
},
579+
Status: bmov1alpha1.BareMetalHostStatus{
580+
Provisioning: bmov1alpha1.ProvisionStatus{
581+
State: bmov1alpha1.StateAvailable,
582+
},
583+
},
584+
}
573585
hostWithUnhealthyAnnotation := bmov1alpha1.BareMetalHost{
574586
ObjectMeta: metav1.ObjectMeta{
575587
Name: "hostWithUnhealthyAnnotation",
@@ -607,6 +619,19 @@ var _ = Describe("Metal3Machine manager", func() {
607619
},
608620
},
609621
}
622+
hostWithNodeReuseLabelSetToCPinFailureDomain := bmov1alpha1.BareMetalHost{
623+
ObjectMeta: metav1.ObjectMeta{
624+
Name: "hostWithNodeReuseLabelSetToCPinFailureDomain",
625+
Namespace: namespaceName,
626+
Labels: map[string]string{nodeReuseLabelName: "cp-test1", FailureDomainLabelName: "my-fd-1"},
627+
},
628+
Spec: bmov1alpha1.BareMetalHostSpec{},
629+
Status: bmov1alpha1.BareMetalHostStatus{
630+
Provisioning: bmov1alpha1.ProvisionStatus{
631+
State: bmov1alpha1.StateAvailable,
632+
},
633+
},
634+
}
610635
hostWithNodeReuseLabelStateNone := bmov1alpha1.BareMetalHost{
611636
ObjectMeta: metav1.ObjectMeta{
612637
Name: "hostWithNodeReuseLabelStateNone",
@@ -622,13 +647,13 @@ var _ = Describe("Metal3Machine manager", func() {
622647
}
623648

624649
m3mconfig, infrastructureRef := newConfig("", map[string]string{},
625-
[]infrav1.HostSelectorRequirement{},
650+
[]infrav1.HostSelectorRequirement{}, "",
626651
)
627652
m3mconfig2, infrastructureRef2 := newConfig("",
628-
map[string]string{"key1": "value1"}, []infrav1.HostSelectorRequirement{},
653+
map[string]string{"key1": "value1"}, []infrav1.HostSelectorRequirement{}, "",
629654
)
630655
m3mconfig3, infrastructureRef3 := newConfig("",
631-
map[string]string{"boguskey": "value"}, []infrav1.HostSelectorRequirement{},
656+
map[string]string{"boguskey": "value"}, []infrav1.HostSelectorRequirement{}, "",
632657
)
633658
m3mconfig4, infrastructureRef4 := newConfig("", map[string]string{},
634659
[]infrav1.HostSelectorRequirement{
@@ -638,6 +663,7 @@ var _ = Describe("Metal3Machine manager", func() {
638663
Values: []string{"abc", "value1", "123"},
639664
},
640665
},
666+
"",
641667
)
642668
m3mconfig5, infrastructureRef5 := newConfig("", map[string]string{},
643669
[]infrav1.HostSelectorRequirement{
@@ -647,6 +673,10 @@ var _ = Describe("Metal3Machine manager", func() {
647673
Values: []string{"abc", "value1", "123"},
648674
},
649675
},
676+
"",
677+
)
678+
m3mconfig6, infrastructureRef6 := newConfig("", map[string]string{},
679+
[]infrav1.HostSelectorRequirement{}, "my-fd-1",
650680
)
651681

652682
type testCaseChooseHost struct {
@@ -849,6 +879,68 @@ var _ = Describe("Metal3Machine manager", func() {
849879
M3Machine: m3mconfig5,
850880
ExpectedHostName: "",
851881
}),
882+
Entry("Choose a host in Failure Domain", testCaseChooseHost{
883+
Machine: newMachine(machineName, infrastructureRef6),
884+
Hosts: &bmov1alpha1.BareMetalHostList{Items: []bmov1alpha1.BareMetalHost{*availableHost, hostWithLabel, hostWithOtherConsRef, hostWithFailureDomainLabel}},
885+
M3Machine: m3mconfig6,
886+
ExpectedHostName: hostWithFailureDomainLabel.Name,
887+
}),
888+
Entry("Choose available host, when hosts in FailureDomain not available", testCaseChooseHost{
889+
Machine: newMachine(machineName, infrastructureRef6),
890+
Hosts: &bmov1alpha1.BareMetalHostList{Items: []bmov1alpha1.BareMetalHost{*availableHost, hostWithOtherConsRef, hostWithNodeReuseLabelSetToCP}},
891+
M3Machine: m3mconfig6,
892+
ExpectedHostName: availableHost.Name,
893+
}),
894+
Entry("Choose a host in Failure Domain, when NodeReuse is set", testCaseChooseHost{
895+
Machine: &clusterv1.Machine{
896+
ObjectMeta: metav1.ObjectMeta{
897+
Name: machineName,
898+
Namespace: namespaceName,
899+
900+
OwnerReferences: []metav1.OwnerReference{
901+
{
902+
APIVersion: "controlplane.cluster.x-k8s.io/v1beta2",
903+
Name: "test1",
904+
Kind: "KubeadmControlPlane",
905+
},
906+
},
907+
Labels: map[string]string{
908+
clusterv1.MachineControlPlaneLabel: "cluster.x-k8s.io/control-plane",
909+
},
910+
},
911+
Spec: clusterv1.MachineSpec{
912+
InfrastructureRef: *infrastructureRef,
913+
},
914+
},
915+
Hosts: &bmov1alpha1.BareMetalHostList{Items: []bmov1alpha1.BareMetalHost{*availableHost, hostWithLabel, hostWithFailureDomainLabel, hostWithNodeReuseLabelSetToCPinFailureDomain}},
916+
M3Machine: m3mconfig6,
917+
ExpectedHostName: hostWithNodeReuseLabelSetToCPinFailureDomain.Name,
918+
}),
919+
Entry("Choose host is not in Failure Domain, when NodeReuse is set", testCaseChooseHost{
920+
Machine: &clusterv1.Machine{
921+
ObjectMeta: metav1.ObjectMeta{
922+
Name: machineName,
923+
Namespace: namespaceName,
924+
925+
OwnerReferences: []metav1.OwnerReference{
926+
{
927+
APIVersion: "controlplane.cluster.x-k8s.io/v1beta2",
928+
Name: "test1",
929+
Kind: "KubeadmControlPlane",
930+
},
931+
},
932+
Labels: map[string]string{
933+
clusterv1.MachineControlPlaneLabel: "cluster.x-k8s.io/control-plane",
934+
},
935+
},
936+
Spec: clusterv1.MachineSpec{
937+
InfrastructureRef: *infrastructureRef,
938+
},
939+
},
940+
Hosts: &bmov1alpha1.BareMetalHostList{Items: []bmov1alpha1.BareMetalHost{*availableHost, hostWithLabel, hostWithFailureDomainLabel, hostWithNodeReuseLabelSetToCP}},
941+
M3Machine: m3mconfig6,
942+
ExpectedHostName: hostWithNodeReuseLabelSetToCP.Name,
943+
}),
852944
)
853945
})
854946

@@ -1085,7 +1177,7 @@ var _ = Describe("Metal3Machine manager", func() {
10851177
fakeClient := fake.NewClientBuilder().WithScheme(setupSchemeMm()).WithObjects(tc.Host).Build()
10861178

10871179
m3mconfig, infrastructureRef := newConfig(tc.UserDataNamespace,
1088-
map[string]string{}, []infrav1.HostSelectorRequirement{},
1180+
map[string]string{}, []infrav1.HostSelectorRequirement{}, "",
10891181
)
10901182
if tc.UseCustomDeploy != nil {
10911183
m3mconfig.Spec.Image = infrav1.Image{}
@@ -1205,7 +1297,7 @@ var _ = Describe("Metal3Machine manager", func() {
12051297
fakeClient := fake.NewClientBuilder().WithScheme(setupSchemeMm()).WithObjects(tc.Host).Build()
12061298

12071299
m3mconfig, infrastructureRef := newConfig(tc.UserDataNamespace,
1208-
map[string]string{}, []infrav1.HostSelectorRequirement{},
1300+
map[string]string{}, []infrav1.HostSelectorRequirement{}, "",
12091301
)
12101302
machine := newMachine(machineName, infrastructureRef)
12111303

@@ -4579,7 +4671,7 @@ func setupSchemeMm() *runtime.Scheme {
45794671
}
45804672

45814673
func newConfig(userDataNamespace string,
4582-
labels map[string]string, reqs []infrav1.HostSelectorRequirement,
4674+
labels map[string]string, reqs []infrav1.HostSelectorRequirement, failureDomain string,
45834675
) (*infrav1.Metal3Machine, *clusterv1.ContractVersionedObjectReference) {
45844676
config := infrav1.Metal3Machine{
45854677
ObjectMeta: metav1.ObjectMeta{
@@ -4599,6 +4691,7 @@ func newConfig(userDataNamespace string,
45994691
MatchLabels: labels,
46004692
MatchExpressions: reqs,
46014693
},
4694+
FailureDomain: failureDomain,
46024695
},
46034696
Status: infrav1.Metal3MachineStatus{
46044697
UserData: &corev1.SecretReference{

0 commit comments

Comments
 (0)