Skip to content

Commit

Permalink
Ensure last machine removed does not wait for hook
Browse files Browse the repository at this point in the history
When the last machine is in a deleting state, this means that the cluster is
being removed as well. In such a scenario, waiting for draining is not feasible,
because draining is performed only when node deletion is allowed, which it is
not while the cluster is being removed. Cluster API prevents draining with the
"cluster is being deleted" error.

Signed-off-by: Danil-Grigorev <[email protected]>
  • Loading branch information
Danil-Grigorev committed Sep 11, 2024
1 parent 452143d commit 47e23e5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 14 deletions.
20 changes: 19 additions & 1 deletion controlplane/internal/controllers/rke2controlplane_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -720,14 +720,30 @@ func (r *RKE2ControlPlaneReconciler) reconcileDelete(ctx context.Context,
}

// Delete control plane machines in parallel
machinesToDelete := ownedMachines.Filter(collections.Not(collections.HasDeletionTimestamp))
machinesToDelete := ownedMachines

var errs []error

for i := range machinesToDelete {
m := machinesToDelete[i]
logger := logger.WithValues("machine", m)

// During RKE2CP deletion we don't care about forwarding etcd leadership or removing etcd members.
// So we are removing the pre-terminate hook.
// This is important because when deleting RKE2CP we will delete all members of etcd and it's not possible
// to forward etcd leadership without any member left after we went through the Machine deletion.
// Also in this case the reconcileDelete code of the Machine controller won't execute Node drain
// and wait for volume detach.
if err := r.removePreTerminateHookAnnotationFromMachine(ctx, m); err != nil {
errs = append(errs, err)
continue

Check failure on line 739 in controlplane/internal/controllers/rke2controlplane_controller.go

View workflow job for this annotation

GitHub Actions / lint

continue with no blank line before (nlreturn)

Check failure on line 739 in controlplane/internal/controllers/rke2controlplane_controller.go

View workflow job for this annotation

GitHub Actions / lint

continue with no blank line before (nlreturn)

Check failure on line 739 in controlplane/internal/controllers/rke2controlplane_controller.go

View workflow job for this annotation

GitHub Actions / lint

continue with no blank line before (nlreturn)

Check failure on line 739 in controlplane/internal/controllers/rke2controlplane_controller.go

View workflow job for this annotation

GitHub Actions / lint

continue with no blank line before (nlreturn)
}

if !m.DeletionTimestamp.IsZero() {
// Nothing to do, Machine already has deletionTimestamp set.
continue
}

if err := r.Client.Delete(ctx, machinesToDelete[i]); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Failed to cleanup owned machine")
errs = append(errs, err)
Expand All @@ -742,6 +758,8 @@ func (r *RKE2ControlPlaneReconciler) reconcileDelete(ctx context.Context,
return ctrl.Result{}, err
}

logger.Info("Waiting for control plane Machines to not exist anymore")

conditions.MarkFalse(rcp, controlplanev1.ResizedCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")

return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
Expand Down
18 changes: 5 additions & 13 deletions controlplane/internal/controllers/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,19 +155,6 @@ func (r *RKE2ControlPlaneReconciler) scaleDownControlPlane(

// If etcd leadership is on machine that is about to be deleted, move it to the newest member available.
etcdLeaderCandidate := controlPlane.Machines.Newest()
// Removing last member of CP machines
if etcdLeaderCandidate == nil || !etcdLeaderCandidate.DeletionTimestamp.IsZero() {
// During complete RKE2 deletion we don't care about forwarding etcd leadership or removing etcd members.
// So we are removing the pre-terminate hook.
// This is important because when deleting RKE2 we will delete all members of etcd and it's not possible
// to forward etcd leadership without any member left after we went through the Machine deletion.
// Also in this case the reconcileDelete code of the Machine controller won't execute Node drain
// and wait for volume detach.
if err := r.removePreTerminateHookAnnotationFromMachine(ctx, machineToDelete); err != nil {
return ctrl.Result{}, err
}
}

if err := r.workloadCluster.ForwardEtcdLeadership(ctx, machineToDelete, etcdLeaderCandidate); err != nil {
logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)

Expand All @@ -190,6 +177,11 @@ func (r *RKE2ControlPlaneReconciler) scaleDownControlPlane(
}

func (r *RKE2ControlPlaneReconciler) removePreTerminateHookAnnotationFromMachine(ctx context.Context, machine *clusterv1.Machine) error {
if _, exists := machine.Annotations[controlplanev1.PreTerminateHookCleanupAnnotation]; !exists {
// Nothing to do, the annotation is not set (anymore) on the Machine
return nil
}

log := ctrl.LoggerFrom(ctx)
log.Info("Removing pre-terminate hook from control plane Machine")

Expand Down

0 comments on commit 47e23e5

Please sign in to comment.