Skip to content

Commit 6a7d141

Browse files
committed
Handle failed CloudPrivateIPConfig assignments
When a CPIC is created and the cloud controller externally sets its status to Failed, ovn-kubernetes would not clean up or retry the assignment. Reproducer: 1. CPIC assignment fails 2. The EgressIP status never reflects it as it ignores non-sucessfull assignments 3. Later reconciliation attempts trying to create the same CPIC again 4. Creation failures due to "already exists" errors To fix the issue remove failed CPICs druing CPIC add reconciliation. Signed-off-by: Patryk Diak <[email protected]>
1 parent 627ddba commit 6a7d141

File tree

2 files changed

+102
-12
lines changed

2 files changed

+102
-12
lines changed

go-controller/pkg/clustermanager/egressip_controller.go

Lines changed: 89 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ import (
1818

1919
corev1 "k8s.io/api/core/v1"
2020
apierrors "k8s.io/apimachinery/pkg/api/errors"
21+
"k8s.io/apimachinery/pkg/api/meta"
2122
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2223
"k8s.io/apimachinery/pkg/util/sets"
2324
"k8s.io/client-go/tools/record"
25+
"k8s.io/client-go/util/flowcontrol"
2426
"k8s.io/client-go/util/retry"
2527
"k8s.io/klog/v2"
2628
utilnet "k8s.io/utils/net"
@@ -211,6 +213,26 @@ func (eIPC *egressIPClusterController) executeCloudPrivateIPConfigOps(egressIPNa
211213
if cloudPrivateIPConfig.GetDeletionTimestamp() != nil && !cloudPrivateIPConfig.GetDeletionTimestamp().IsZero() {
212214
return fmt.Errorf("cloud update request failed, CloudPrivateIPConfig: %s is being deleted", cloudPrivateIPConfigName)
213215
}
216+
217+
// Handle a scenario in which the object exists in a failed state by removing it
218+
assignedCondition := meta.FindStatusCondition(cloudPrivateIPConfig.Status.Conditions, string(ocpcloudnetworkapi.Assigned))
219+
if assignedCondition != nil && assignedCondition.Status == metav1.ConditionFalse {
220+
klog.Warningf("CloudPrivateIPConfig: %s is in Failed state (reason: %s), deleting to allow retry", cloudPrivateIPConfigName, assignedCondition.Message)
221+
eIPRef := corev1.ObjectReference{
222+
Kind: "EgressIP",
223+
Name: egressIPName,
224+
}
225+
eIPC.recorder.Eventf(&eIPRef, corev1.EventTypeWarning, "CloudAssignmentRetry",
226+
"egress IP: %s previously failed on node %s (reason: %s), will retry assignment",
227+
egressIP, cloudPrivateIPConfig.Spec.Node, assignedCondition.Message)
228+
if err := eIPC.kube.DeleteCloudPrivateIPConfig(cloudPrivateIPConfigName); err != nil {
229+
return fmt.Errorf("failed to delete failed CloudPrivateIPConfig: %s, err: %v", cloudPrivateIPConfigName, err)
230+
}
231+
232+
// Return an error to trigger retry
233+
return fmt.Errorf("deleted failed CloudPrivateIPConfig: %s, will retry creation in next reconciliation", cloudPrivateIPConfigName)
234+
}
235+
214236
if op.toAdd == cloudPrivateIPConfig.Spec.Node {
215237
klog.Infof("CloudPrivateIPConfig: %s already assigned to node: %s", cloudPrivateIPConfigName, cloudPrivateIPConfig.Spec.Node)
216238
continue
@@ -371,6 +393,8 @@ type egressIPClusterController struct {
371393
egressIPHandler *factory.Handler
372394
// cloudPrivateIPConfig events factory handler
373395
cloudPrivateIPConfigHandler *factory.Handler
396+
// cloudPrivateIPBackoff failure backoff tracker
397+
cloudPrivateIPBackoff *flowcontrol.Backoff
374398
}
375399

376400
func newEgressIPController(ovnClient *util.OVNClusterManagerClientset, wf *factory.WatchFactory, recorder record.EventRecorder) *egressIPClusterController {
@@ -396,6 +420,7 @@ func newEgressIPController(ovnClient *util.OVNClusterManagerClientset, wf *facto
396420
reachabilityCheckInterval: egressIPReachabilityCheckInterval,
397421
egressIPNodeHealthCheckPort: config.OVNKubernetesFeature.EgressIPNodeHealthCheckPort,
398422
stopChan: make(chan struct{}),
423+
cloudPrivateIPBackoff: flowcontrol.NewBackOff(30*time.Second, 5*time.Minute),
399424
}
400425
eIPC.initRetryFramework()
401426
return eIPC
@@ -1498,18 +1523,22 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o
14981523
}
14991524
if new != nil {
15001525
newCloudPrivateIPConfig = new
1526+
assignedCondition := meta.FindStatusCondition(newCloudPrivateIPConfig.Status.Conditions, string(ocpcloudnetworkapi.Assigned))
1527+
1528+
if assignedCondition != nil && assignedCondition.Status == metav1.ConditionFalse {
1529+
return eIPC.handleFailedCloudPrivateIPConfig(oldCloudPrivateIPConfig, assignedCondition)
1530+
}
1531+
15011532
// We should only proceed to setting things up for objects where the new
15021533
// object has the same .spec.node and .status.node, and assignment
15031534
// condition being true. This is how the cloud-network-config-controller
15041535
// indicates a successful cloud assignment.
1505-
shouldAdd = newCloudPrivateIPConfig.Status.Node == newCloudPrivateIPConfig.Spec.Node &&
1506-
ocpcloudnetworkapi.CloudPrivateIPConfigConditionType(newCloudPrivateIPConfig.Status.Conditions[0].Type) == ocpcloudnetworkapi.Assigned &&
1507-
corev1.ConditionStatus(newCloudPrivateIPConfig.Status.Conditions[0].Status) == corev1.ConditionTrue
1536+
shouldAdd = newCloudPrivateIPConfig.Status.Node == newCloudPrivateIPConfig.Spec.Node && assignedCondition != nil && assignedCondition.Status == metav1.ConditionTrue
15081537
// See above explanation for the delete
15091538
shouldDelete = shouldDelete &&
15101539
(newCloudPrivateIPConfig.Status.Node == "" || newCloudPrivateIPConfig.Status.Node != oldCloudPrivateIPConfig.Status.Node) &&
1511-
ocpcloudnetworkapi.CloudPrivateIPConfigConditionType(newCloudPrivateIPConfig.Status.Conditions[0].Type) == ocpcloudnetworkapi.Assigned &&
1512-
corev1.ConditionStatus(newCloudPrivateIPConfig.Status.Conditions[0].Status) == corev1.ConditionTrue
1540+
assignedCondition != nil && assignedCondition.Status == metav1.ConditionTrue
1541+
15131542
// On UPDATE we need to delete the old .status.node
15141543
if shouldDelete {
15151544
nodeToDelete = oldCloudPrivateIPConfig.Status.Node
@@ -1570,7 +1599,7 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o
15701599
}
15711600
for _, resyncEgressIP := range resyncEgressIPs {
15721601
if err := eIPC.reconcileEgressIP(nil, resyncEgressIP); err != nil {
1573-
return fmt.Errorf("synthetic update for EgressIP: %s failed, err: %v", egressIP.Name, err)
1602+
return fmt.Errorf("synthetic update for EgressIP: %s failed, err: %v", resyncEgressIP.Name, err)
15741603
}
15751604
}
15761605
}
@@ -1587,11 +1616,15 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o
15871616
oldCloudPrivateIPConfig.Name)
15881617
return nil
15891618
}
1619+
1620+
// Reset backoff on successful assignment for this specific IP
1621+
egressIPString := cloudPrivateIPConfigNameToIPString(newCloudPrivateIPConfig.Name)
1622+
eIPC.cloudPrivateIPBackoff.Reset(egressIPString)
1623+
15901624
egressIP, err := eIPC.kube.GetEgressIP(egressIPName)
15911625
if err != nil {
15921626
return err
15931627
}
1594-
egressIPString := cloudPrivateIPConfigNameToIPString(newCloudPrivateIPConfig.Name)
15951628
statusItem := egressipv1.EgressIPStatusItem{
15961629
Node: newCloudPrivateIPConfig.Status.Node,
15971630
EgressIP: egressIPString,
@@ -1668,24 +1701,63 @@ func cloudPrivateIPConfigNameToIPString(name string) string {
16681701
return net.ParseIP(name).String()
16691702
}
16701703

1704+
// handleFailedCloudPrivateIPConfig handles CPIC assignment failures by
1705+
// scheduling EgressIP retry with exponential backoff
1706+
func (eIPC *egressIPClusterController) handleFailedCloudPrivateIPConfig(
1707+
cloudPrivateIPConfig *ocpcloudnetworkapi.CloudPrivateIPConfig,
1708+
failureCondition *metav1.Condition) error {
1709+
1710+
egressIPName, exists := cloudPrivateIPConfig.Annotations[util.OVNEgressIPOwnerRefLabel]
1711+
if !exists {
1712+
klog.Warningf("CloudPrivateIPConfig %q is missing egress IP owner reference annotation upon failure",
1713+
cloudPrivateIPConfig.Name)
1714+
return nil
1715+
}
1716+
1717+
klog.Warningf("CloudPrivateIPConfig %s for EgressIP %s failed (reason: %s), scheduling retry with backoff",
1718+
cloudPrivateIPConfig.Name, egressIPName, failureCondition.Message)
1719+
1720+
// Get the EgressIP object
1721+
egressIP, err := eIPC.kube.GetEgressIP(egressIPName)
1722+
if apierrors.IsNotFound(err) {
1723+
// EgressIP was deleted, nothing to retry
1724+
klog.V(5).Infof("EgressIP %s no longer exists, skipping retry", egressIPName)
1725+
return nil
1726+
}
1727+
if err != nil {
1728+
return err
1729+
}
1730+
1731+
egressIPAddr := cloudPrivateIPConfigNameToIPString(cloudPrivateIPConfig.Name)
1732+
eIPC.cloudPrivateIPBackoff.Next(egressIPAddr, eIPC.cloudPrivateIPBackoff.Clock.Now())
1733+
backoff := eIPC.cloudPrivateIPBackoff.Get(egressIPAddr)
1734+
klog.Infof("Scheduling EgressIP %s retry for IP %s with backoff %v after CPIC failure",
1735+
egressIPName, egressIPAddr, backoff)
1736+
if err := eIPC.retryEgressIPs.AddRetryObjWithAddBackoff(egressIP, backoff); err != nil {
1737+
return fmt.Errorf("failed to add EgressIP %s to retry framework: %v", egressIPName, err)
1738+
}
1739+
1740+
return nil
1741+
}
1742+
16711743
// removePendingOps removes the existing pending CloudPrivateIPConfig operations
16721744
// from the cache and returns the EgressIP object which can be re-synced given
16731745
// the new assignment possibilities.
1674-
func (eIPC *egressIPClusterController) removePendingOpsAndGetResyncs(egressIPName, egressIP string) ([]*egressipv1.EgressIP, error) {
1746+
func (eIPC *egressIPClusterController) removePendingOpsAndGetResyncs(egressIPName, egressIPAddr string) ([]*egressipv1.EgressIP, error) {
16751747
eIPC.pendingCloudPrivateIPConfigsMutex.Lock()
16761748
defer eIPC.pendingCloudPrivateIPConfigsMutex.Unlock()
16771749
ops, pending := eIPC.pendingCloudPrivateIPConfigsOps[egressIPName]
16781750
if !pending {
16791751
return nil, fmt.Errorf("no pending operation found for EgressIP: %s", egressIPName)
16801752
}
1681-
op, exists := ops[egressIP]
1753+
op, exists := ops[egressIPAddr]
16821754
if !exists {
1683-
return nil, fmt.Errorf("pending operations found for EgressIP: %s, but not for the finalized IP: %s", egressIPName, egressIP)
1755+
return nil, fmt.Errorf("pending operations found for EgressIP: %s, but not for the finalized IP: %s", egressIPName, egressIPAddr)
16841756
}
16851757
// Make sure we are dealing with a delete operation, since for update
16861758
// operations will still need to process the add afterwards.
16871759
if op.toAdd == "" && op.toDelete != "" {
1688-
delete(ops, egressIP)
1760+
delete(ops, egressIPAddr)
16891761
}
16901762
if len(ops) == 0 {
16911763
delete(eIPC.pendingCloudPrivateIPConfigsOps, egressIPName)
@@ -1707,7 +1779,12 @@ func (eIPC *egressIPClusterController) removePendingOpsAndGetResyncs(egressIPNam
17071779
// CloudPrivateIPConfig for which we are currently processing the
17081780
// deletion for.
17091781
if egressIP.Name == egressIPName {
1710-
continue
1782+
for _, specIP := range egressIP.Spec.EgressIPs {
1783+
// if the EgressIP object still has the IP that got removed in the spec it likely wants it assigned back so retry
1784+
if specIP == egressIPAddr {
1785+
resyncs = append(resyncs, &egressIP)
1786+
}
1787+
}
17111788
}
17121789
unassigned := len(egressIP.Spec.EgressIPs) - len(egressIP.Status.Items)
17131790
ops, pending := eIPC.pendingCloudPrivateIPConfigsOps[egressIP.Name]

go-controller/pkg/retry/obj_retry.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,19 @@ func (r *RetryFramework) AddRetryObjWithAddNoBackoff(obj interface{}) error {
201201
return nil
202202
}
203203

204+
// AddRetryObjWithAddBackoff adds an object to be retried for add with a custom backoff.
205+
// It will lock the key, create or update retryObject, and unlock the key
206+
func (r *RetryFramework) AddRetryObjWithAddBackoff(obj interface{}, backoff time.Duration) error {
207+
key, err := GetResourceKey(obj)
208+
if err != nil {
209+
return fmt.Errorf("could not get the key of %s %v: %v", r.ResourceHandler.ObjType, obj, err)
210+
}
211+
r.DoWithLock(key, func(key string) {
212+
r.initRetryObjWithAddBackoff(obj, key, backoff)
213+
})
214+
return nil
215+
}
216+
204217
func (r *RetryFramework) getRetryObj(lockedKey string) (value *retryObjEntry, found bool) {
205218
return r.retryEntries.Load(lockedKey)
206219
}

0 commit comments

Comments
 (0)