Skip to content

Commit 9ea1f58

Browse files
⚠️ Improve chained upgrade observability (#12973)
* Improve chained upgrade observability
* Address comments
1 parent 3f969d8 commit 9ea1f58

18 files changed

+1296
-830
lines changed

api/core/v1beta2/cluster_types.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,17 +80,24 @@ const (
8080
// failing due to an error.
8181
ClusterTopologyReconciledFailedReason = "ReconcileFailed"
8282

83+
// ClusterTopologyReconciledClusterCreatingReason documents reconciliation of a Cluster topology
84+
// not yet created because the BeforeClusterCreate hook is blocking.
85+
ClusterTopologyReconciledClusterCreatingReason = "ClusterCreating"
86+
8387
// ClusterTopologyReconciledControlPlaneUpgradePendingReason documents reconciliation of a Cluster topology
8488
// not yet completed because Control Plane is not yet updated to match the desired topology spec.
89+
// Deprecated: use ClusterTopologyReconciledClusterUpgradingReason ("ClusterUpgrading") instead.
8590
ClusterTopologyReconciledControlPlaneUpgradePendingReason = "ControlPlaneUpgradePending"
8691

8792
// ClusterTopologyReconciledMachineDeploymentsCreatePendingReason documents reconciliation of a Cluster topology
8893
// not yet completed because at least one of the MachineDeployments is yet to be created.
8994
// This generally happens because new MachineDeployment creations are held off while the ControlPlane is not stable.
95+
// Deprecated: use ClusterTopologyReconciledClusterUpgradingReason ("ClusterUpgrading") instead.
9096
ClusterTopologyReconciledMachineDeploymentsCreatePendingReason = "MachineDeploymentsCreatePending"
9197

9298
// ClusterTopologyReconciledMachineDeploymentsUpgradePendingReason documents reconciliation of a Cluster topology
9399
// not yet completed because at least one of the MachineDeployments is not yet updated to match the desired topology spec.
100+
// Deprecated: use ClusterTopologyReconciledClusterUpgradingReason ("ClusterUpgrading") instead.
94101
ClusterTopologyReconciledMachineDeploymentsUpgradePendingReason = "MachineDeploymentsUpgradePending"
95102

96103
// ClusterTopologyReconciledMachineDeploymentsUpgradeDeferredReason documents reconciliation of a Cluster topology
@@ -99,11 +106,13 @@ const (
99106

100107
// ClusterTopologyReconciledMachinePoolsUpgradePendingReason documents reconciliation of a Cluster topology
101108
// not yet completed because at least one of the MachinePools is not yet updated to match the desired topology spec.
109+
// Deprecated: use ClusterTopologyReconciledClusterUpgradingReason ("ClusterUpgrading") instead.
102110
ClusterTopologyReconciledMachinePoolsUpgradePendingReason = "MachinePoolsUpgradePending"
103111

104112
// ClusterTopologyReconciledMachinePoolsCreatePendingReason documents reconciliation of a Cluster topology
105113
// not yet completed because at least one of the MachinePools is yet to be created.
106114
// This generally happens because new MachinePool creations are held off while the ControlPlane is not stable.
115+
// Deprecated: use ClusterTopologyReconciledClusterUpgradingReason ("ClusterUpgrading") instead.
107116
ClusterTopologyReconciledMachinePoolsCreatePendingReason = "MachinePoolsCreatePending"
108117

109118
// ClusterTopologyReconciledMachinePoolsUpgradeDeferredReason documents reconciliation of a Cluster topology
@@ -112,8 +121,12 @@ const (
112121

113122
// ClusterTopologyReconciledHookBlockingReason documents reconciliation of a Cluster topology
114123
// not yet completed because at least one of the lifecycle hooks is blocking.
124+
// Deprecated: use ClusterTopologyReconciledClusterUpgradingReason ("ClusterUpgrading") instead.
115125
ClusterTopologyReconciledHookBlockingReason = "LifecycleHookBlocking"
116126

127+
// ClusterTopologyReconciledClusterUpgradingReason documents reconciliation of a Cluster topology
128+
// not yet completed because a cluster upgrade is still in progress.
129+
ClusterTopologyReconciledClusterUpgradingReason = "ClusterUpgrading"
117130
// ClusterTopologyReconciledClusterClassNotReconciledReason documents reconciliation of a Cluster topology not
118131
// yet completed because the ClusterClass has not reconciled yet. If this condition persists there may be an issue
119132
// with the ClusterClass surfaced in the ClusterClass status or controller logs.

api/core/v1beta2/v1beta1_condition_consts.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,17 +300,24 @@ const (
300300
// failing due to an error.
301301
TopologyReconcileFailedV1Beta1Reason = "TopologyReconcileFailed"
302302

303+
// TopologyReconciledClusterCreatingV1Beta1Reason documents reconciliation of a Cluster topology
304+
// not yet created because the BeforeClusterCreate hook is blocking.
305+
TopologyReconciledClusterCreatingV1Beta1Reason = "ClusterCreating"
306+
303307
// TopologyReconciledControlPlaneUpgradePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
304308
// not yet completed because Control Plane is not yet updated to match the desired topology spec.
309+
// Deprecated: use TopologyReconciledClusterUpgradingV1Beta1Reason ("ClusterUpgrading") instead.
305310
TopologyReconciledControlPlaneUpgradePendingV1Beta1Reason = "ControlPlaneUpgradePending"
306311

307312
// TopologyReconciledMachineDeploymentsCreatePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
308313
// not yet completed because at least one of the MachineDeployments is yet to be created.
309314
// This generally happens because new MachineDeployment creations are held off while the ControlPlane is not stable.
315+
// Deprecated: use TopologyReconciledClusterUpgradingV1Beta1Reason ("ClusterUpgrading") instead.
310316
TopologyReconciledMachineDeploymentsCreatePendingV1Beta1Reason = "MachineDeploymentsCreatePending"
311317

312318
// TopologyReconciledMachineDeploymentsUpgradePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
313319
// not yet completed because at least one of the MachineDeployments is not yet updated to match the desired topology spec.
320+
// Deprecated: use TopologyReconciledClusterUpgradingV1Beta1Reason ("ClusterUpgrading") instead.
314321
TopologyReconciledMachineDeploymentsUpgradePendingV1Beta1Reason = "MachineDeploymentsUpgradePending"
315322

316323
// TopologyReconciledMachineDeploymentsUpgradeDeferredV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
@@ -319,11 +326,13 @@ const (
319326

320327
// TopologyReconciledMachinePoolsUpgradePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
321328
// not yet completed because at least one of the MachinePools is not yet updated to match the desired topology spec.
329+
// Deprecated: use TopologyReconciledClusterUpgradingV1Beta1Reason ("ClusterUpgrading") instead.
322330
TopologyReconciledMachinePoolsUpgradePendingV1Beta1Reason = "MachinePoolsUpgradePending"
323331

324332
// TopologyReconciledMachinePoolsCreatePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
325333
// not yet completed because at least one of the MachinePools is yet to be created.
326334
// This generally happens because new MachinePool creations are held off while the ControlPlane is not stable.
335+
// Deprecated: use TopologyReconciledClusterUpgradingV1Beta1Reason ("ClusterUpgrading") instead.
327336
TopologyReconciledMachinePoolsCreatePendingV1Beta1Reason = "MachinePoolsCreatePending"
328337

329338
// TopologyReconciledMachinePoolsUpgradeDeferredV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
@@ -332,8 +341,13 @@ const (
332341

333342
// TopologyReconciledHookBlockingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
334343
// not yet completed because at least one of the lifecycle hooks is blocking.
344+
// Deprecated: use TopologyReconciledClusterUpgradingV1Beta1Reason ("ClusterUpgrading") instead.
335345
TopologyReconciledHookBlockingV1Beta1Reason = "LifecycleHookBlocking"
336346

347+
// TopologyReconciledClusterUpgradingV1Beta1Reason documents reconciliation of a Cluster topology
348+
// not yet completed because a cluster upgrade is still in progress.
349+
TopologyReconciledClusterUpgradingV1Beta1Reason = "ClusterUpgrading"
350+
337351
// TopologyReconciledClusterClassNotReconciledV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology not
338352
// yet completed because the ClusterClass has not reconciled yet. If this condition persists there may be an issue
339353
// with the ClusterClass surfaced in the ClusterClass status or controller logs.

exp/topology/desiredstate/desired_state.go

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package desiredstate
1919

2020
import (
2121
"context"
22+
"fmt"
2223
"maps"
2324
"reflect"
2425
"time"
@@ -30,6 +31,7 @@ import (
3031
"k8s.io/apimachinery/pkg/runtime/schema"
3132
"k8s.io/klog/v2"
3233
"k8s.io/utils/ptr"
34+
ctrl "sigs.k8s.io/controller-runtime"
3335
"sigs.k8s.io/controller-runtime/pkg/client"
3436

3537
clusterv1beta1 "sigs.k8s.io/cluster-api/api/core/v1beta1"
@@ -507,6 +509,8 @@ func (g *generator) computeControlPlane(ctx context.Context, s *scope.Scope, inf
507509
// The version is calculated using the state of the current machine deployments, the current control plane
508510
// and the version defined in the topology.
509511
func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Scope) (string, error) {
512+
log := ctrl.LoggerFrom(ctx)
513+
510514
topologyVersion := s.Blueprint.Topology.Version
511515
// If we are creating the control plane object (current control plane is nil), use version from topology.
512516
if s.Current.ControlPlane == nil || s.Current.ControlPlane.Object == nil {
@@ -599,8 +603,7 @@ func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Sco
599603
// Also check if MachineDeployments/MachinePools are already upgrading.
600604
// If the MachineDeployments/MachinePools are upgrading, then do not pick up the next control plane version yet.
601605
// We will pick up the new version after the MachineDeployments/MachinePools finish upgrading.
602-
if len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) > 0 ||
603-
len(s.UpgradeTracker.MachinePools.UpgradingNames()) > 0 {
606+
if s.UpgradeTracker.MachineDeployments.IsAnyUpgrading() || s.UpgradeTracker.MachinePools.IsAnyUpgrading() {
604607
return *currentVersion, nil
605608
}
606609

@@ -692,6 +695,11 @@ func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Sco
692695
s.UpgradeTracker.ControlPlane.IsStartingUpgrade = true
693696
s.UpgradeTracker.ControlPlane.IsPendingUpgrade = false
694697

698+
log.Info(fmt.Sprintf("Control plane %s upgraded from version %s to version %s", klog.KObj(s.Current.ControlPlane.Object), *currentVersion, nextVersion),
699+
"ControlPlaneUpgrades", toUpgradeStep(s.UpgradeTracker.ControlPlane.UpgradePlan),
700+
"WorkersUpgrades", toUpgradeStep(s.UpgradeTracker.MachineDeployments.UpgradePlan, s.UpgradeTracker.MachinePools.UpgradePlan),
701+
s.Current.ControlPlane.Object.GetKind(), klog.KObj(s.Current.ControlPlane.Object),
702+
)
695703
return nextVersion, nil
696704
}
697705

@@ -857,7 +865,7 @@ func (g *generator) computeMachineDeployment(ctx context.Context, s *scope.Scope
857865
// Add ClusterTopologyMachineDeploymentLabel to the generated InfrastructureMachine template
858866
infraMachineTemplateLabels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] = machineDeploymentTopology.Name
859867
desiredMachineDeployment.InfrastructureMachineTemplate.SetLabels(infraMachineTemplateLabels)
860-
version, err := g.computeMachineDeploymentVersion(s, machineDeploymentTopology, currentMachineDeployment)
868+
version, err := g.computeMachineDeploymentVersion(ctx, s, machineDeploymentTopology, currentMachineDeployment)
861869
if err != nil {
862870
return nil, err
863871
}
@@ -1039,7 +1047,9 @@ func (g *generator) computeMachineDeployment(ctx context.Context, s *scope.Scope
10391047
// computeMachineDeploymentVersion calculates the version of the desired machine deployment.
10401048
// The version is calculated using the state of the current machine deployments,
10411049
// the current control plane and the version defined in the topology.
1042-
func (g *generator) computeMachineDeploymentVersion(s *scope.Scope, machineDeploymentTopology clusterv1.MachineDeploymentTopology, currentMDState *scope.MachineDeploymentState) (string, error) {
1050+
func (g *generator) computeMachineDeploymentVersion(ctx context.Context, s *scope.Scope, machineDeploymentTopology clusterv1.MachineDeploymentTopology, currentMDState *scope.MachineDeploymentState) (string, error) {
1051+
log := ctrl.LoggerFrom(ctx)
1052+
10431053
topologyVersion := s.Blueprint.Topology.Version
10441054
// If creating a new machine deployment, mark it as pending if the control plane is not
10451055
// yet stable. Creating a new MD while the control plane is upgrading can lead to unexpected race conditions.
@@ -1111,6 +1121,12 @@ func (g *generator) computeMachineDeploymentVersion(s *scope.Scope, machineDeplo
11111121
s.UpgradeTracker.MachineDeployments.MarkUpgrading(currentMDState.Object.Name)
11121122

11131123
nextVersion := s.UpgradeTracker.MachineDeployments.UpgradePlan[0]
1124+
1125+
log.Info(fmt.Sprintf("MachineDeployment %s upgraded from version %s to version %s", klog.KObj(currentMDState.Object), currentVersion, nextVersion),
1126+
"ControlPlaneUpgrades", toUpgradeStep(s.UpgradeTracker.ControlPlane.UpgradePlan),
1127+
"WorkersUpgrades", toUpgradeStep(s.UpgradeTracker.MachineDeployments.UpgradePlan, s.UpgradeTracker.MachinePools.UpgradePlan),
1128+
"MachineDeployment", klog.KObj(currentMDState.Object),
1129+
)
11141130
return nextVersion, nil
11151131
}
11161132

@@ -1165,7 +1181,7 @@ func (g *generator) computeMachinePools(ctx context.Context, s *scope.Scope) (sc
11651181
// computeMachinePool computes the desired state for a MachinePoolTopology.
11661182
// The generated machinePool object is calculated using the values from the machinePoolTopology and
11671183
// the machinePool class.
1168-
func (g *generator) computeMachinePool(_ context.Context, s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology) (*scope.MachinePoolState, error) {
1184+
func (g *generator) computeMachinePool(ctx context.Context, s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology) (*scope.MachinePoolState, error) {
11691185
desiredMachinePool := &scope.MachinePoolState{}
11701186

11711187
// Gets the blueprint for the MachinePool class.
@@ -1243,7 +1259,7 @@ func (g *generator) computeMachinePool(_ context.Context, s *scope.Scope, machin
12431259
// Add ClusterTopologyMachinePoolLabel to the generated InfrastructureMachinePool object
12441260
infraMachinePoolObjectLabels[clusterv1.ClusterTopologyMachinePoolNameLabel] = machinePoolTopology.Name
12451261
desiredMachinePool.InfrastructureMachinePoolObject.SetLabels(infraMachinePoolObjectLabels)
1246-
version, err := g.computeMachinePoolVersion(s, machinePoolTopology, currentMachinePool)
1262+
version, err := g.computeMachinePoolVersion(ctx, s, machinePoolTopology, currentMachinePool)
12471263
if err != nil {
12481264
return nil, err
12491265
}
@@ -1359,7 +1375,9 @@ func (g *generator) computeMachinePool(_ context.Context, s *scope.Scope, machin
13591375
// computeMachinePoolVersion calculates the version of the desired machine pool.
13601376
// The version is calculated using the state of the current machine pools,
13611377
// the current control plane and the version defined in the topology.
1362-
func (g *generator) computeMachinePoolVersion(s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology, currentMPState *scope.MachinePoolState) (string, error) {
1378+
func (g *generator) computeMachinePoolVersion(ctx context.Context, s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology, currentMPState *scope.MachinePoolState) (string, error) {
1379+
log := ctrl.LoggerFrom(ctx)
1380+
13631381
topologyVersion := s.Blueprint.Topology.Version
13641382
// If creating a new machine pool, mark it as pending if the control plane is not
13651383
// yet stable. Creating a new MP while the control plane is upgrading can lead to unexpected race conditions.
@@ -1431,6 +1449,12 @@ func (g *generator) computeMachinePoolVersion(s *scope.Scope, machinePoolTopolog
14311449
s.UpgradeTracker.MachinePools.MarkUpgrading(currentMPState.Object.Name)
14321450

14331451
nextVersion := s.UpgradeTracker.MachinePools.UpgradePlan[0]
1452+
1453+
log.Info(fmt.Sprintf("MachinePool %s upgraded from version %s to version %s", klog.KObj(currentMPState.Object), currentVersion, nextVersion),
1454+
"ControlPlaneUpgrades", toUpgradeStep(s.UpgradeTracker.ControlPlane.UpgradePlan),
1455+
"WorkersUpgrades", toUpgradeStep(s.UpgradeTracker.MachineDeployments.UpgradePlan, s.UpgradeTracker.MachinePools.UpgradePlan),
1456+
"MachinePool", klog.KObj(currentMPState.Object),
1457+
)
14341458
return nextVersion, nil
14351459
}
14361460

exp/topology/desiredstate/desired_state_test.go

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,11 +1062,11 @@ func TestComputeControlPlaneVersion(t *testing.T) {
10621062

10631063
catalog := runtimecatalog.New()
10641064
_ = runtimehooksv1.AddToCatalog(catalog)
1065+
beforeClusterUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.BeforeClusterUpgrade)
1066+
beforeControlPlaneUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.BeforeControlPlaneUpgrade)
1067+
beforeWorkersUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.BeforeWorkersUpgrade)
1068+
afterWorkersUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.AfterWorkersUpgrade)
10651069

1066-
beforeClusterUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.BeforeClusterUpgrade)
1067-
if err != nil {
1068-
panic("unable to compute GVH")
1069-
}
10701070
nonBlockingBeforeClusterUpgradeResponse := &runtimehooksv1.BeforeClusterUpgradeResponse{
10711071
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
10721072
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1090,10 +1090,6 @@ func TestComputeControlPlaneVersion(t *testing.T) {
10901090
},
10911091
}
10921092

1093-
beforeControlPlaneUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.BeforeControlPlaneUpgrade)
1094-
if err != nil {
1095-
panic("unable to compute GVH")
1096-
}
10971093
nonBlockingBeforeControlPlaneUpgradeResponse := &runtimehooksv1.BeforeControlPlaneUpgradeResponse{
10981094
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
10991095
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1117,10 +1113,6 @@ func TestComputeControlPlaneVersion(t *testing.T) {
11171113
},
11181114
}
11191115

1120-
beforeWorkersUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.BeforeWorkersUpgrade)
1121-
if err != nil {
1122-
panic("unable to compute GVH")
1123-
}
11241116
nonBlockingBeforeWorkersUpgradeResponse := &runtimehooksv1.BeforeWorkersUpgradeResponse{
11251117
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
11261118
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1144,10 +1136,6 @@ func TestComputeControlPlaneVersion(t *testing.T) {
11441136
},
11451137
}
11461138

1147-
afterWorkersUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.AfterWorkersUpgrade)
1148-
if err != nil {
1149-
panic("unable to compute GVH")
1150-
}
11511139
nonBlockingAfterWorkersUpgradeResponse := &runtimehooksv1.AfterWorkersUpgradeResponse{
11521140
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
11531141
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1705,6 +1693,12 @@ func TestComputeControlPlaneVersion(t *testing.T) {
17051693

17061694
runtimeClient := fakeruntimeclient.NewRuntimeClientBuilder().
17071695
WithCatalog(catalog).
1696+
WithGetAllExtensionResponses(map[runtimecatalog.GroupVersionHook][]string{
1697+
beforeClusterUpgradeGVH: {"foo"},
1698+
beforeControlPlaneUpgradeGVH: {"foo"},
1699+
beforeWorkersUpgradeGVH: {"foo"},
1700+
afterWorkersUpgradeGVH: {"foo"},
1701+
}).
17081702
WithCallAllExtensionResponses(map[runtimecatalog.GroupVersionHook]runtimehooksv1.ResponseObject{
17091703
beforeClusterUpgradeGVH: tt.beforeClusterUpgradeResponse,
17101704
beforeControlPlaneUpgradeGVH: tt.beforeControlPlaneUpgradeResponse,
@@ -2969,7 +2963,7 @@ func TestComputeMachineDeploymentVersion(t *testing.T) {
29692963

29702964
e := generator{}
29712965

2972-
version, err := e.computeMachineDeploymentVersion(s, tt.machineDeploymentTopology, tt.currentMachineDeploymentState)
2966+
version, err := e.computeMachineDeploymentVersion(ctx, s, tt.machineDeploymentTopology, tt.currentMachineDeploymentState)
29732967
g.Expect(err).NotTo(HaveOccurred())
29742968
g.Expect(version).To(Equal(tt.expectedVersion))
29752969

@@ -3214,7 +3208,7 @@ func TestComputeMachinePoolVersion(t *testing.T) {
32143208

32153209
e := generator{}
32163210

3217-
version, err := e.computeMachinePoolVersion(s, tt.machinePoolTopology, tt.currentMachinePoolState)
3211+
version, err := e.computeMachinePoolVersion(ctx, s, tt.machinePoolTopology, tt.currentMachinePoolState)
32183212
g.Expect(err).NotTo(HaveOccurred())
32193213
g.Expect(version).To(Equal(tt.expectedVersion))
32203214

0 commit comments

Comments
 (0)