diff --git a/go-controller/pkg/clustermanager/egressip_controller.go b/go-controller/pkg/clustermanager/egressip_controller.go index 70ec7a58e2..b035fd88e1 100644 --- a/go-controller/pkg/clustermanager/egressip_controller.go +++ b/go-controller/pkg/clustermanager/egressip_controller.go @@ -36,6 +36,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors" + "k8s.io/apimachinery/pkg/api/meta" ) const ( @@ -211,6 +212,31 @@ func (eIPC *egressIPClusterController) executeCloudPrivateIPConfigOps(egressIPNa if cloudPrivateIPConfig.GetDeletionTimestamp() != nil && !cloudPrivateIPConfig.GetDeletionTimestamp().IsZero() { return fmt.Errorf("cloud update request failed, CloudPrivateIPConfig: %s is being deleted", cloudPrivateIPConfigName) } + + // Handle a scenario in which the object exists in a failed state by removing it if the node it was assigned to no longer exists + assignedCondition := meta.FindStatusCondition(cloudPrivateIPConfig.Status.Conditions, string(ocpcloudnetworkapi.Assigned)) + if cloudPrivateIPConfig.Status.Node != "" && assignedCondition != nil && assignedCondition.Status == metav1.ConditionFalse { + _, err := eIPC.watchFactory.GetNode(cloudPrivateIPConfig.Status.Node) + if err != nil && apierrors.IsNotFound(err) { + klog.Warningf("CloudPrivateIPConfig: %s is in Failed state (reason: %s) and node %s no longer exists, deleting to allow retry", + cloudPrivateIPConfigName, assignedCondition.Message, cloudPrivateIPConfig.Status.Node) + eIPRef := corev1.ObjectReference{ + Kind: "EgressIP", + Name: egressIPName, + } + eIPC.recorder.Eventf(&eIPRef, corev1.EventTypeWarning, "CloudAssignmentRetry", + "egress IP: %s previously failed on deleted node %s (reason: %s), will retry assignment", + egressIP, cloudPrivateIPConfig.Status.Node, assignedCondition.Message) + if err := eIPC.kube.DeleteCloudPrivateIPConfig(cloudPrivateIPConfigName); err != nil { + return fmt.Errorf("failed to delete failed CloudPrivateIPConfig: %s, err: %v", cloudPrivateIPConfigName, err) + } + + return fmt.Errorf("deleted failed CloudPrivateIPConfig: %s, will retry creation in next reconciliation", cloudPrivateIPConfigName) + } else if err != nil { + klog.Errorf("Failed to check if node %s exists for CloudPrivateIPConfig %s: %v", cloudPrivateIPConfig.Status.Node, cloudPrivateIPConfigName, err) + } + } + if op.toAdd == cloudPrivateIPConfig.Spec.Node { klog.Infof("CloudPrivateIPConfig: %s already assigned to node: %s", cloudPrivateIPConfigName, cloudPrivateIPConfig.Spec.Node) continue @@ -1671,21 +1697,21 @@ func cloudPrivateIPConfigNameToIPString(name string) string { // removePendingOps removes the existing pending CloudPrivateIPConfig operations // from the cache and returns the EgressIP object which can be re-synced given // the new assignment possibilities. -func (eIPC *egressIPClusterController) removePendingOpsAndGetResyncs(egressIPName, egressIP string) ([]*egressipv1.EgressIP, error) { +func (eIPC *egressIPClusterController) removePendingOpsAndGetResyncs(egressIPName, egressIPAddr string) ([]*egressipv1.EgressIP, error) { eIPC.pendingCloudPrivateIPConfigsMutex.Lock() defer eIPC.pendingCloudPrivateIPConfigsMutex.Unlock() ops, pending := eIPC.pendingCloudPrivateIPConfigsOps[egressIPName] if !pending { return nil, fmt.Errorf("no pending operation found for EgressIP: %s", egressIPName) } - op, exists := ops[egressIP] + op, exists := ops[egressIPAddr] if !exists { - return nil, fmt.Errorf("pending operations found for EgressIP: %s, but not for the finalized IP: %s", egressIPName, egressIP) + return nil, fmt.Errorf("pending operations found for EgressIP: %s, but not for the finalized IP: %s", egressIPName, egressIPAddr) } // Make sure we are dealing with a delete operation, since for update // operations will still need to process the add afterwards. if op.toAdd == "" && op.toDelete != "" { - delete(ops, egressIP) + delete(ops, egressIPAddr) } if len(ops) == 0 { delete(eIPC.pendingCloudPrivateIPConfigsOps, egressIPName) @@ -1703,10 +1729,15 @@ func (eIPC *egressIPClusterController) removePendingOpsAndGetResyncs(egressIPNam resyncs := make([]*egressipv1.EgressIP, 0, len(egressIPs)) for _, egressIP := range egressIPs { egressIP := *egressIP - // Do not process the egress IP object which owns the - // CloudPrivateIPConfig for which we are currently processing the - // deletion for. if egressIP.Name == egressIPName { + for _, specIP := range egressIP.Spec.EgressIPs { + // Do not process the egress IP object which owns the + // CloudPrivateIPConfig for which we are currently processing the + // deletion for unless it still has the IP in it's spec + if specIP == egressIPAddr { + resyncs = append(resyncs, &egressIP) + } + } continue } unassigned := len(egressIP.Spec.EgressIPs) - len(egressIP.Status.Items)