@@ -18,9 +18,11 @@ import (
1818
1919 corev1 "k8s.io/api/core/v1"
2020 apierrors "k8s.io/apimachinery/pkg/api/errors"
21+ "k8s.io/apimachinery/pkg/api/meta"
2122 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2223 "k8s.io/apimachinery/pkg/util/sets"
2324 "k8s.io/client-go/tools/record"
25+ "k8s.io/client-go/util/flowcontrol"
2426 "k8s.io/client-go/util/retry"
2527 "k8s.io/klog/v2"
2628 utilnet "k8s.io/utils/net"
@@ -211,6 +213,26 @@ func (eIPC *egressIPClusterController) executeCloudPrivateIPConfigOps(egressIPNa
211213 if cloudPrivateIPConfig .GetDeletionTimestamp () != nil && ! cloudPrivateIPConfig .GetDeletionTimestamp ().IsZero () {
212214 return fmt .Errorf ("cloud update request failed, CloudPrivateIPConfig: %s is being deleted" , cloudPrivateIPConfigName )
213215 }
216+
217+ // Handle a scenario in which the object exists in a failed state by removing it
218+ assignedCondition := meta .FindStatusCondition (cloudPrivateIPConfig .Status .Conditions , string (ocpcloudnetworkapi .Assigned ))
219+ if assignedCondition != nil && assignedCondition .Status == metav1 .ConditionFalse {
220+ klog .Warningf ("CloudPrivateIPConfig: %s is in Failed state (reason: %s), deleting to allow retry" , cloudPrivateIPConfigName , assignedCondition .Message )
221+ eIPRef := corev1.ObjectReference {
222+ Kind : "EgressIP" ,
223+ Name : egressIPName ,
224+ }
225+ eIPC .recorder .Eventf (& eIPRef , corev1 .EventTypeWarning , "CloudAssignmentRetry" ,
226+ "egress IP: %s previously failed on node %s (reason: %s), will retry assignment" ,
227+ egressIP , cloudPrivateIPConfig .Spec .Node , assignedCondition .Message )
228+ if err := eIPC .kube .DeleteCloudPrivateIPConfig (cloudPrivateIPConfigName ); err != nil {
229+ return fmt .Errorf ("failed to delete failed CloudPrivateIPConfig: %s, err: %v" , cloudPrivateIPConfigName , err )
230+ }
231+
232+ // Return an error to trigger retry
233+ return fmt .Errorf ("deleted failed CloudPrivateIPConfig: %s, will retry creation in next reconciliation" , cloudPrivateIPConfigName )
234+ }
235+
214236 if op .toAdd == cloudPrivateIPConfig .Spec .Node {
215237 klog .Infof ("CloudPrivateIPConfig: %s already assigned to node: %s" , cloudPrivateIPConfigName , cloudPrivateIPConfig .Spec .Node )
216238 continue
@@ -371,6 +393,8 @@ type egressIPClusterController struct {
371393 egressIPHandler * factory.Handler
372394 // cloudPrivateIPConfig events factory handler
373395 cloudPrivateIPConfigHandler * factory.Handler
396+ // cloudPrivateIPBackoff failure backoff tracker
397+ cloudPrivateIPBackoff * flowcontrol.Backoff
374398}
375399
376400func newEgressIPController (ovnClient * util.OVNClusterManagerClientset , wf * factory.WatchFactory , recorder record.EventRecorder ) * egressIPClusterController {
@@ -396,6 +420,7 @@ func newEgressIPController(ovnClient *util.OVNClusterManagerClientset, wf *facto
396420 reachabilityCheckInterval : egressIPReachabilityCheckInterval ,
397421 egressIPNodeHealthCheckPort : config .OVNKubernetesFeature .EgressIPNodeHealthCheckPort ,
398422 stopChan : make (chan struct {}),
423+ cloudPrivateIPBackoff : flowcontrol .NewBackOff (30 * time .Second , 5 * time .Minute ),
399424 }
400425 eIPC .initRetryFramework ()
401426 return eIPC
@@ -1498,18 +1523,22 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o
14981523 }
14991524 if new != nil {
15001525 newCloudPrivateIPConfig = new
1526+ assignedCondition := meta .FindStatusCondition (newCloudPrivateIPConfig .Status .Conditions , string (ocpcloudnetworkapi .Assigned ))
1527+
1528+ if assignedCondition != nil && assignedCondition .Status == metav1 .ConditionFalse {
1529+ return eIPC .handleFailedCloudPrivateIPConfig (oldCloudPrivateIPConfig , assignedCondition )
1530+ }
1531+
15011532 // We should only proceed to setting things up for objects where the new
15021533 // object has the same .spec.node and .status.node, and assignment
15031534 // condition being true. This is how the cloud-network-config-controller
15041535 // indicates a successful cloud assignment.
1505- shouldAdd = newCloudPrivateIPConfig .Status .Node == newCloudPrivateIPConfig .Spec .Node &&
1506- ocpcloudnetworkapi .CloudPrivateIPConfigConditionType (newCloudPrivateIPConfig .Status .Conditions [0 ].Type ) == ocpcloudnetworkapi .Assigned &&
1507- corev1 .ConditionStatus (newCloudPrivateIPConfig .Status .Conditions [0 ].Status ) == corev1 .ConditionTrue
1536+ shouldAdd = newCloudPrivateIPConfig .Status .Node == newCloudPrivateIPConfig .Spec .Node && assignedCondition != nil && assignedCondition .Status == metav1 .ConditionTrue
15081537 // See above explanation for the delete
15091538 shouldDelete = shouldDelete &&
15101539 (newCloudPrivateIPConfig .Status .Node == "" || newCloudPrivateIPConfig .Status .Node != oldCloudPrivateIPConfig .Status .Node ) &&
1511- ocpcloudnetworkapi . CloudPrivateIPConfigConditionType ( newCloudPrivateIPConfig . Status . Conditions [ 0 ]. Type ) == ocpcloudnetworkapi . Assigned &&
1512- corev1 . ConditionStatus ( newCloudPrivateIPConfig . Status . Conditions [ 0 ]. Status ) == corev1 . ConditionTrue
1540+ assignedCondition != nil && assignedCondition . Status == metav1 . ConditionTrue
1541+
15131542 // On UPDATE we need to delete the old .status.node
15141543 if shouldDelete {
15151544 nodeToDelete = oldCloudPrivateIPConfig .Status .Node
@@ -1570,7 +1599,7 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o
15701599 }
15711600 for _ , resyncEgressIP := range resyncEgressIPs {
15721601 if err := eIPC .reconcileEgressIP (nil , resyncEgressIP ); err != nil {
1573- return fmt .Errorf ("synthetic update for EgressIP: %s failed, err: %v" , egressIP .Name , err )
1602+ return fmt .Errorf ("synthetic update for EgressIP: %s failed, err: %v" , resyncEgressIP .Name , err )
15741603 }
15751604 }
15761605 }
@@ -1587,11 +1616,15 @@ func (eIPC *egressIPClusterController) reconcileCloudPrivateIPConfig(old, new *o
15871616 oldCloudPrivateIPConfig .Name )
15881617 return nil
15891618 }
1619+
1620+ // Reset backoff on successful assignment for this specific IP
1621+ egressIPString := cloudPrivateIPConfigNameToIPString (newCloudPrivateIPConfig .Name )
1622+ eIPC .cloudPrivateIPBackoff .Reset (egressIPString )
1623+
15901624 egressIP , err := eIPC .kube .GetEgressIP (egressIPName )
15911625 if err != nil {
15921626 return err
15931627 }
1594- egressIPString := cloudPrivateIPConfigNameToIPString (newCloudPrivateIPConfig .Name )
15951628 statusItem := egressipv1.EgressIPStatusItem {
15961629 Node : newCloudPrivateIPConfig .Status .Node ,
15971630 EgressIP : egressIPString ,
@@ -1668,24 +1701,63 @@ func cloudPrivateIPConfigNameToIPString(name string) string {
16681701 return net .ParseIP (name ).String ()
16691702}
16701703
1704+ // handleFailedCloudPrivateIPConfig handles CPIC assignment failures by
1705+ // scheduling EgressIP retry with exponential backoff
1706+ func (eIPC * egressIPClusterController ) handleFailedCloudPrivateIPConfig (
1707+ cloudPrivateIPConfig * ocpcloudnetworkapi.CloudPrivateIPConfig ,
1708+ failureCondition * metav1.Condition ) error {
1709+
1710+ egressIPName , exists := cloudPrivateIPConfig .Annotations [util .OVNEgressIPOwnerRefLabel ]
1711+ if ! exists {
1712+ klog .Warningf ("CloudPrivateIPConfig %q is missing egress IP owner reference annotation upon failure" ,
1713+ cloudPrivateIPConfig .Name )
1714+ return nil
1715+ }
1716+
1717+ klog .Warningf ("CloudPrivateIPConfig %s for EgressIP %s failed (reason: %s), scheduling retry with backoff" ,
1718+ cloudPrivateIPConfig .Name , egressIPName , failureCondition .Message )
1719+
1720+ // Get the EgressIP object
1721+ egressIP , err := eIPC .kube .GetEgressIP (egressIPName )
1722+ if apierrors .IsNotFound (err ) {
1723+ // EgressIP was deleted, nothing to retry
1724+ klog .V (5 ).Infof ("EgressIP %s no longer exists, skipping retry" , egressIPName )
1725+ return nil
1726+ }
1727+ if err != nil {
1728+ return err
1729+ }
1730+
1731+ egressIPAddr := cloudPrivateIPConfigNameToIPString (cloudPrivateIPConfig .Name )
1732+ eIPC .cloudPrivateIPBackoff .Next (egressIPAddr , eIPC .cloudPrivateIPBackoff .Clock .Now ())
1733+ backoff := eIPC .cloudPrivateIPBackoff .Get (egressIPAddr )
1734+ klog .Infof ("Scheduling EgressIP %s retry for IP %s with backoff %v after CPIC failure" ,
1735+ egressIPName , egressIPAddr , backoff )
1736+ if err := eIPC .retryEgressIPs .AddRetryObjWithAddBackoff (egressIP , backoff ); err != nil {
1737+ return fmt .Errorf ("failed to add EgressIP %s to retry framework: %v" , egressIPName , err )
1738+ }
1739+
1740+ return nil
1741+ }
1742+
16711743// removePendingOps removes the existing pending CloudPrivateIPConfig operations
16721744// from the cache and returns the EgressIP object which can be re-synced given
16731745// the new assignment possibilities.
1674- func (eIPC * egressIPClusterController ) removePendingOpsAndGetResyncs (egressIPName , egressIP string ) ([]* egressipv1.EgressIP , error ) {
1746+ func (eIPC * egressIPClusterController ) removePendingOpsAndGetResyncs (egressIPName , egressIPAddr string ) ([]* egressipv1.EgressIP , error ) {
16751747 eIPC .pendingCloudPrivateIPConfigsMutex .Lock ()
16761748 defer eIPC .pendingCloudPrivateIPConfigsMutex .Unlock ()
16771749 ops , pending := eIPC .pendingCloudPrivateIPConfigsOps [egressIPName ]
16781750 if ! pending {
16791751 return nil , fmt .Errorf ("no pending operation found for EgressIP: %s" , egressIPName )
16801752 }
1681- op , exists := ops [egressIP ]
1753+ op , exists := ops [egressIPAddr ]
16821754 if ! exists {
1683- return nil , fmt .Errorf ("pending operations found for EgressIP: %s, but not for the finalized IP: %s" , egressIPName , egressIP )
1755+ return nil , fmt .Errorf ("pending operations found for EgressIP: %s, but not for the finalized IP: %s" , egressIPName , egressIPAddr )
16841756 }
16851757 // Make sure we are dealing with a delete operation, since for update
16861758 // operations will still need to process the add afterwards.
16871759 if op .toAdd == "" && op .toDelete != "" {
1688- delete (ops , egressIP )
1760+ delete (ops , egressIPAddr )
16891761 }
16901762 if len (ops ) == 0 {
16911763 delete (eIPC .pendingCloudPrivateIPConfigsOps , egressIPName )
@@ -1707,7 +1779,12 @@ func (eIPC *egressIPClusterController) removePendingOpsAndGetResyncs(egressIPNam
17071779 // CloudPrivateIPConfig for which we are currently processing the
17081780 // deletion for.
17091781 if egressIP .Name == egressIPName {
1710- continue
1782+ for _ , specIP := range egressIP .Spec .EgressIPs {
1783+ // if the EgressIP object still has the IP that got removed in the spec it likely wants it assigned back so retry
1784+ if specIP == egressIPAddr {
1785+ resyncs = append (resyncs , & egressIP )
1786+ }
1787+ }
17111788 }
17121789 unassigned := len (egressIP .Spec .EgressIPs ) - len (egressIP .Status .Items )
17131790 ops , pending := eIPC .pendingCloudPrivateIPConfigsOps [egressIP .Name ]
0 commit comments