Skip to content

Commit ec96875

Browse files
committed
Wait for API pod to be healthy before registering control plane node in
instance group
1 parent 5ec4d50 commit ec96875

File tree

4 files changed

+43
-2
lines changed

4 files changed

+43
-2
lines changed

cloud/interfaces.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ type MachineGetter interface {
8686
Project() string
8787
Role() string
8888
IsControlPlane() bool
89+
IsFirstMachine() bool
90+
IsAPIServerHealthy() bool
8991
ControlPlaneGroupName() string
9092
GetInstanceID() *string
9193
GetProviderID() string

cloud/scope/machine.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/go-logr/logr"
2828

2929
"github.com/pkg/errors"
30+
3031
"golang.org/x/mod/semver"
3132
"google.golang.org/api/compute/v1"
3233
corev1 "k8s.io/api/core/v1"
@@ -47,6 +48,7 @@ type MachineScopeParams struct {
4748
ClusterGetter cloud.ClusterGetter
4849
Machine *clusterv1.Machine
4950
GCPMachine *infrav1.GCPMachine
51+
IsFirst bool
5052
}
5153

5254
// NewMachineScope creates a new MachineScope from the supplied parameters.
@@ -73,6 +75,7 @@ func NewMachineScope(params MachineScopeParams) (*MachineScope, error) {
7375
GCPMachine: params.GCPMachine,
7476
ClusterGetter: params.ClusterGetter,
7577
patchHelper: helper,
78+
IsFirst: params.IsFirst,
7679
}, nil
7780
}
7881

@@ -83,6 +86,7 @@ type MachineScope struct {
8386
ClusterGetter cloud.ClusterGetter
8487
Machine *clusterv1.Machine
8588
GCPMachine *infrav1.GCPMachine
89+
IsFirst bool
8690
}
8791

8892
// ANCHOR: MachineGetter
@@ -140,6 +144,24 @@ func (m *MachineScope) IsControlPlane() bool {
140144
return IsControlPlaneMachine(m.Machine)
141145
}
142146

147+
// IsFirstMachine returns the IsFirst flag supplied when the scope was created; the GCPMachine controller sets it when the cluster contains exactly one Machine.
148+
func (m *MachineScope) IsFirstMachine() bool {
149+
return m.IsFirst
150+
}
151+
152+
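// IsAPIServerHealthy returns true if the Machine's V1Beta2 status has an APIServerPodHealthy condition with status True; it returns false if that status is unset or the condition is absent.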
153+
func (m *MachineScope) IsAPIServerHealthy() bool {
154+
if m.Machine.Status.V1Beta2 == nil {
155+
return false
156+
}
157+
for _, condition := range m.Machine.Status.V1Beta2.Conditions {
158+
if condition.Type == "APIServerPodHealthy" && condition.Status == "True" {
159+
return true
160+
}
161+
}
162+
return false
163+
}
164+
143165
// Role returns the machine role from the labels.
144166
func (m *MachineScope) Role() string {
145167
if IsControlPlaneMachine(m.Machine) {

cloud/services/compute/instances/reconcile.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,17 @@ func (s *Service) Reconcile(ctx context.Context) error {
8989
s.scope.SetInstanceStatus(infrav1.InstanceStatus(instance.Status))
9090

9191
if s.scope.IsControlPlane() {
92-
if err := s.registerControlPlaneInstance(ctx, instance); err != nil {
93-
return err
92+
// If the instance is part of the control plane, we need to ensure it's
93+
// registered with the instance group. We only do this if the API server is healthy or if this is the first
94+
// control plane machine. This prevents a hairpinning issue where a new control plane machine attempts to reach
95+
// the API server via a load balancer that is not yet ready. The first control plane machine is handled specially
96+
// by the kubeadm controller, so it can be added to the instance group immediately.
97+
if s.scope.IsAPIServerHealthy() || s.scope.IsFirstMachine() {
98+
if err := s.registerControlPlaneInstance(ctx, instance); err != nil {
99+
return err
100+
}
101+
} else {
102+
log.Info("Waiting for API server to be healthy before registering control plane instance in instance group")
94103
}
95104
}
96105

controllers/gcpmachine_controller.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,13 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request)
178178
return ctrl.Result{}, nil
179179
}
180180

181+
// List all Machines in the cluster's namespace that carry its cluster-name label; the machine is treated as the first one when it is the only Machine returned.
182+
machineList := &clusterv1.MachineList{}
183+
if err := r.List(ctx, machineList, client.InNamespace(cluster.Namespace), client.MatchingLabels{clusterv1.ClusterNameLabel: cluster.Name}); err != nil {
184+
log.Error(err, "failed to list machines for cluster")
185+
return ctrl.Result{}, err
186+
}
187+
181188
// Create the cluster scope
182189
clusterScope, err := scope.NewClusterScope(ctx, scope.ClusterScopeParams{
183190
Client: r.Client,
@@ -194,6 +201,7 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request)
194201
Machine: machine,
195202
GCPMachine: gcpMachine,
196203
ClusterGetter: clusterScope,
204+
IsFirst: len(machineList.Items) == 1,
197205
})
198206
if err != nil {
199207
return ctrl.Result{}, errors.Errorf("failed to create scope: %+v", err)

0 commit comments

Comments
 (0)