Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 25 additions & 3 deletions pkg/destroy/gcp/address.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,43 @@ const (
regionalAddressResource = "regionaddress"
)

func (o *ClusterUninstaller) deleteAddressByName(ctx context.Context, resourceName, location string) error {
typeName := globalAddressResource
if location != string(global) {
typeName = regionalAddressResource
}
items, err := o.listAddressesWithFilter(ctx, typeName, "items(name,labels),nextPageToken", func(item *compute.Address) bool {
// Address should have labels, but none observed
return item.Name == resourceName
})
if err != nil {
return fmt.Errorf("failed to list address by name : %w", err)
}
for _, item := range items {
if err := o.deleteAddress(ctx, item); err != nil {
return fmt.Errorf("failed to delete address by name: %w", err)
}
}
return nil
}

func (o *ClusterUninstaller) listAddresses(ctx context.Context, typeName string) ([]cloudResource, error) {
return o.listAddressesWithFilter(ctx, typeName, "items(name,region,addressType),nextPageToken", o.isClusterResource)
return o.listAddressesWithFilter(ctx, typeName, "items(name,region,addressType,labels),nextPageToken", func(item *compute.Address) bool {
return o.isClusterResource(item.Name) && !o.isSharedResource(item.Labels)
})
}

// listAddressesWithFilter lists addresses in the project that satisfy the filter criteria.
// The fields parameter specifies which fields should be returned in the result, the filter string contains
// a filter string passed to the API to filter results.
func (o *ClusterUninstaller) listAddressesWithFilter(ctx context.Context, typeName, fields string, filterFunc resourceFilterFunc) ([]cloudResource, error) {
func (o *ClusterUninstaller) listAddressesWithFilter(ctx context.Context, typeName, fields string, filterFunc func(item *compute.Address) bool) ([]cloudResource, error) {
o.Logger.Debugf("Listing addresses")
result := []cloudResource{}

pagesFunc := func(list *compute.AddressList) error {
for _, item := range list.Items {
o.Logger.Debugf("Found address (%s): %s", typeName, item.Name)
if filterFunc(item.Name) {
if filterFunc(item) {
var quota []gcp.QuotaUsage
if item.AddressType == "INTERNAL" {
quota = []gcp.QuotaUsage{{
Expand Down
19 changes: 19 additions & 0 deletions pkg/destroy/gcp/backendservice.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,25 @@ const (
regionBackendServiceResource = "regionbackendservice"
)

func (o *ClusterUninstaller) deleteBackendServiceByName(ctx context.Context, resourceName, location string) error {
typeName := globalBackendServiceResource
if location != string(global) {
typeName = regionBackendServiceResource
}
items, err := o.listBackendServicesWithFilter(ctx, typeName, "items(name),nextPageToken", func(item *compute.BackendService) bool {
return item.Name == resourceName
})
if err != nil {
return fmt.Errorf("failed to list backend services by name : %w", err)
}
for _, item := range items {
if err := o.deleteBackendService(ctx, item); err != nil {
return fmt.Errorf("failed to delete backend service by name: %w", err)
}
}
return nil
}

func (o *ClusterUninstaller) listBackendServices(ctx context.Context, typeName string) ([]cloudResource, error) {
return o.listBackendServicesWithFilter(ctx, typeName, "items(name),nextPageToken",
func(item *compute.BackendService) bool {
Expand Down
18 changes: 10 additions & 8 deletions pkg/destroy/gcp/cloudcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ func (o *ClusterUninstaller) listCloudControllerBackendServices(ctx context.Cont

fwList, err := o.computeSvc.Firewalls.List(o.ProjectID).Fields(googleapi.Field("items(name,targetTags),nextPageToken")).Context(ctx).Do()
if err != nil {
o.Logger.Debugf("failed to list firewall rules associated with backend service %s: %v", item.Name, err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably still want to keep this log?

return false
}
for _, fw := range fwList.Items {
Expand Down Expand Up @@ -103,9 +102,7 @@ func (o *ClusterUninstaller) listCloudControllerTargetPools(ctx context.Context,
break
}
}

if !foundClusterResource {
o.Logger.Debugf("Skipping target pool instance %s because it is not a cluster resource", item.Name)
return false
}
}
Expand All @@ -121,7 +118,10 @@ func (o *ClusterUninstaller) discoverCloudControllerLoadBalancerResources(ctx co
loadBalancerFilterFunc := o.createLoadBalancerFilterFunc(loadBalancerName)

// Discover associated addresses: loadBalancerName
found, err := o.listAddressesWithFilter(ctx, regionalAddressResource, "items(name),nextPageToken", loadBalancerFilterFunc)
found, err := o.listAddressesWithFilter(ctx, regionalAddressResource, "items(name),nextPageToken", func(item *compute.Address) bool {
// address should have labels but none observed
return strings.HasPrefix(item.Name, loadBalancerName)
})
Comment on lines +122 to +124
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has the same check as loadBalancerFilterFunc right? Can we keep the original implementation? We can still keep the comment though.

if err != nil {
return err
}
Expand Down Expand Up @@ -160,7 +160,12 @@ func (o *ClusterUninstaller) discoverCloudControllerLoadBalancerResources(ctx co
o.insertPendingItems(firewallResourceName, found)

// Discover associated forwarding rules: loadBalancerName
found, err = o.listForwardingRulesWithFilter(ctx, regionForwardingRuleResource, "items(name),nextPageToken", loadBalancerFilterFunc)
found, err = o.listForwardingRulesWithFilter(ctx, regionForwardingRuleResource, "items(name,labels),nextPageToken", func(item *compute.ForwardingRule) bool {
// The forwarding rule should be checked for labels to ensure that
// it is owned and not shared. However, the forwarding rules associated with
// load balancers do not have these labels.
return strings.HasPrefix(item.Name, loadBalancerName)
})
Comment on lines +164 to +168
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question as above: This has the same check as loadBalancerFilterFunc right? We can still keep the comment though.

if err != nil {
return err
}
Expand Down Expand Up @@ -203,7 +208,6 @@ func (o *ClusterUninstaller) discoverCloudControllerLoadBalancerResources(ctx co
// For each of those backend services, resources like forwarding rules, firewalls, health checks and
// backend services are added to pendingItems
func (o *ClusterUninstaller) discoverCloudControllerResources(ctx context.Context) error {
o.Logger.Debugf("Discovering cloud controller resources")
errs := []error{}

// Instance group related items
Expand All @@ -222,7 +226,6 @@ func (o *ClusterUninstaller) discoverCloudControllerResources(ctx context.Contex
return err
}
for _, backend := range backends {
o.Logger.Debugf("Discovering cloud controller resources for %s", backend.name)
err := o.discoverCloudControllerLoadBalancerResources(ctx, backend.name)
if err != nil {
errs = append(errs, err)
Expand All @@ -244,7 +247,6 @@ func (o *ClusterUninstaller) discoverCloudControllerResources(ctx context.Contex
return err
}
for _, pool := range pools {
o.Logger.Debugf("Discovering cloud controller resources for %s", pool.name)
err := o.discoverCloudControllerLoadBalancerResources(ctx, pool.name)
if err != nil {
errs = append(errs, err)
Expand Down
14 changes: 14 additions & 0 deletions pkg/destroy/gcp/firewall.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package gcp

import (
"context"
"fmt"
"strings"

"github.com/pkg/errors"
Expand All @@ -15,6 +16,19 @@ const (
firewallResourceName = "firewall"
)

func (o *ClusterUninstaller) deleteFirewallByName(ctx context.Context, resourceName string) error {
items, err := o.listFirewallsWithFilter(ctx, "items(name),nextPageToken", func(item string) bool { return strings.Contains(item, resourceName) })
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question as above: should we make it consistent as currently we use both == and .Contains in different places?

if err != nil {
return fmt.Errorf("failed to list firewall by name: %w", err)
}
for _, item := range items {
if err := o.deleteFirewall(ctx, item); err != nil {
return fmt.Errorf("failed to delete firewall by name: %w", err)
}
}
return nil
}

func (o *ClusterUninstaller) listFirewalls(ctx context.Context) ([]cloudResource, error) {
// The firewall rules that the destroyer is searching for here include a
// pattern before and after the cluster ID. Use a regular expression that allows
Expand Down
27 changes: 24 additions & 3 deletions pkg/destroy/gcp/forwardingrule.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,36 @@ const (
regionForwardingRuleResource = "regionalforwardingrule"
)

func (o *ClusterUninstaller) deleteForwardingRuleByName(ctx context.Context, resourceName, location string) error {
typeName := regionForwardingRuleResource
if location == string(global) {
typeName = globalForwardingRuleResource
}
items, err := o.listForwardingRulesWithFilter(ctx, typeName, "items(name,labels),nextPageToken", func(item *compute.ForwardingRule) bool {
return item.Name == resourceName && o.isOwnedResource(item.Labels)
})
if err != nil {
return fmt.Errorf("failed to list forwarding rules by name: %w", err)
}
for _, item := range items {
if err := o.deleteForwardingRule(ctx, item); err != nil {
return fmt.Errorf("failed to delete forwarding rule by name: %w", err)
}
}
return nil
}

func (o *ClusterUninstaller) listForwardingRules(ctx context.Context, typeName string) ([]cloudResource, error) {
return o.listForwardingRulesWithFilter(ctx, typeName, "items(name,region,loadBalancingScheme),nextPageToken", o.isClusterResource)
return o.listForwardingRulesWithFilter(ctx, typeName, "items(name,region,loadBalancingScheme,labels),nextPageToken", func(item *compute.ForwardingRule) bool {
return o.isClusterResource(item.Name) && o.isOwnedResource(item.Labels)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are other places (i.e. listImages in pkg/destroy/gcp/image.go) that check like:

o.isClusterResource(item.Name) && !o.isSharedResource(item.Labels).

Should we make it consistent?

})
}

// listForwardingRulesWithFilter lists forwarding rules in the project that satisfy the filter criteria.
// The fields parameter specifies which fields should be returned in the result, the filter string contains
// a filter string passed to the API to filter results. The filterFunc is a client-side filtering function
// that determines whether a particular result should be returned or not.
func (o *ClusterUninstaller) listForwardingRulesWithFilter(ctx context.Context, typeName, fields string, filterFunc resourceFilterFunc) ([]cloudResource, error) {
func (o *ClusterUninstaller) listForwardingRulesWithFilter(ctx context.Context, typeName, fields string, filterFunc func(item *compute.ForwardingRule) bool) ([]cloudResource, error) {
o.Logger.Debugf("Listing forwarding rules")
ctx, cancel := context.WithTimeout(ctx, defaultTimeout)
defer cancel()
Expand All @@ -33,7 +54,7 @@ func (o *ClusterUninstaller) listForwardingRulesWithFilter(ctx context.Context,

pagesFunc := func(list *compute.ForwardingRuleList) error {
for _, item := range list.Items {
if filterFunc(item.Name) {
if filterFunc(item) {
logrus.Debugf("Found forwarding rule: %s", item.Name)
var quota []gcp.QuotaUsage
if item.LoadBalancingScheme == "EXTERNAL" {
Expand Down
79 changes: 77 additions & 2 deletions pkg/destroy/gcp/gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ const (

// DONE represents the done status for a compute service operation.
DONE = "DONE"

alreadyInUseStr = "is already being used by "
)

// ClusterUninstaller holds the various options for the cluster we want to delete
Expand Down Expand Up @@ -441,6 +443,72 @@ func operationErrorMessage(op *compute.Operation) string {
return strings.Join(errs, ", ")
}

func (o *ClusterUninstaller) handleDependentResourceError(ctx context.Context, op *compute.Operation, err error) error {
errorCode := 0
errMsg := ""
switch {
case op != nil:
errorCode = int(op.HttpErrorStatusCode)
errMsg = operationErrorMessage(op)
case err != nil:
var apiErr *googleapi.Error
if errors.As(err, &apiErr) {
errorCode = apiErr.Code
errMsg = apiErr.Message
} else {
errMsg = err.Error()
}
default:
return fmt.Errorf("failed to extract information from operation or error")
}

if errorCode == 400 && strings.Contains(errMsg, alreadyInUseStr) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

strings.Contains(errMsg, alreadyInUseStr)

❓ How future-proof is this condition? Will it ever change? For example, the wording might change.

I have always wondered if it's OK to parse an error message like this. Any thoughts?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any detail in the returned error that is more deterministic? Maybe an error code like RESOURCE_IN_USE_BY_ANOTHER_RESOURCE as seen in OCPBUGS-62260?

time="2025-09-02T22:22:36Z" level=debug msg="Deleting network ci-op-rmm0ntky-adfcd-v2qc6-network"
time="2025-09-02T22:22:47Z" level=debug msg="failed to delete network ci-op-rmm0ntky-adfcd-v2qc6-network with error: RESOURCE_IN_USE_BY_ANOTHER_RESOURCE: The network resource 'projects/XXXXXXXXXXXXXXXXXXXXXXXX/global/networks/ci-op-rmm0ntky-adfcd-v2qc6-network' is already being used by 'projects/XXXXXXXXXXXXXXXXXXXXXXXX/global/firewalls/k8s-fw-ae92bfcbbc0574c2d8291350ffbbb9a7'"

Also, is there anywhere in the error that can indicate the dependent resource id, rather than the error message?

splitDetails := strings.Split(errMsg, alreadyInUseStr)
resource := strings.ReplaceAll(
strings.ReplaceAll(
strings.ReplaceAll(splitDetails[len(splitDetails)-1], "\"", ""),
"'", "",
), ", resourceInUseByAnotherResource", "",
)
splitResource := strings.Split(resource, "/")

if len(splitResource) > 6 || len(splitResource) < 5 {
return fmt.Errorf("dependent resource information unable to be parsed: %s", resource)
}
// global -> ex: projects/xxxxxxxxxxxxxxxx/global/resource-type/resource-name
// regional -> ex: projects/xxxxxxxxxxxxxx/region/region-name/resource-type/resource-name
location, resourceType, resourceName := splitResource[len(splitResource)-3], splitResource[len(splitResource)-2], splitResource[len(splitResource)-1]
o.Logger.Debugf("found dependent resource information: %s, %s, %s", location, resourceType, resourceName)
Comment on lines +466 to +481
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part can probably be refactored into a separate func, for example:

func (o *ClusterUninstaller) parseResourceFromError(errMsg string) (location, resourceType, resourceName string, err error) {}

But then I have the same question whether it is future-proof to parse the error message?


var deleteErr error
switch resourceType {
case "backendServices":
deleteErr = o.deleteBackendServiceByName(ctx, resourceName, location)
case "firewalls":
deleteErr = o.deleteFirewallByName(ctx, resourceName)
case "forwardingRules":
deleteErr = o.deleteForwardingRuleByName(ctx, resourceName, location)
case "subnetworks":
deleteErr = o.deleteSubnetworkByName(ctx, resourceName)
case "routers":
deleteErr = o.deleteRouterByName(ctx, resourceName)
case "addresses":
deleteErr = o.deleteAddressByName(ctx, resourceName, location)
case "targetPools":
deleteErr = o.deleteTargetPoolByName(ctx, resourceName)
case "targetTcpProxies":
deleteErr = o.deleteTargetTCPProxyByName(ctx, resourceName)
default:
deleteErr = fmt.Errorf("failed to find resource type: %s for %s", resourceType, resourceName)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
deleteErr = fmt.Errorf("failed to find resource type: %s for %s", resourceType, resourceName)
deleteErr = fmt.Errorf("unsupported resource type %q for dependent deletion",
resourceType)

}

if deleteErr != nil {
return fmt.Errorf("failed to delete dependent resource: %w", deleteErr)
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should have log when the deletion works:

o.Logger.Infof("Successfully deleted dependent resource %s/%s", resourceType, resourceName)

return nil
}

func (o *ClusterUninstaller) handleOperation(ctx context.Context, op *compute.Operation, err error, item cloudResource, resourceType string) error {
identifier := []string{item.typeName, item.name}
if item.zone != "" {
Expand All @@ -454,13 +522,20 @@ func (o *ClusterUninstaller) handleOperation(ctx context.Context, op *compute.Op
o.deletePendingItems(item.typeName, []cloudResource{item})
return nil
}
return fmt.Errorf("failed to delete %s %s: %w", resourceType, item.name, err)

err = o.handleDependentResourceError(ctx, op, err)
if err != nil {
o.Logger.Debugf("failed to handle dependent resource error: %v", err)
}
Comment on lines +526 to +529
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should still return the error here so that the destroy loop will try again to delete the resource next time, right?

depErr := o.handleDependentResourceError(ctx, op, err)
if depErr != nil {
    // If we successfully identified and attempted to delete a dependent resource,
    // return a more informative error
    return fmt.Errorf("failed to delete %s %s: %w (dependent resource handling: %w)", resourceType, item.name, err, depErr)
}
// If dependent resource was handled successfully, return original error
// The retry logic will attempt deletion again
return fmt.Errorf("failed to delete %s %s: %w", resourceType, item.name, err)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the gcp jobs are failing due to nil pointer panics (for example, ci/prow/e2e-gcp-ovn, ci/prow/e2e-gcp-default-config) are happening because of this 😓

}

// wait for operation to complete before checking any further
op, err = o.waitFor(ctx, op, item)

if op != nil && op.Status == DONE && isErrorStatus(op.HttpErrorStatusCode) {
err = o.handleDependentResourceError(ctx, op, err)
if err != nil {
o.Logger.Debugf("failed to handle dependent resource error: %v", err)
}
Comment on lines +535 to +538
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably use a warning here:

depErr := o.handleDependentResourceError(ctx, op, err)
if depErr != nil {
    o.Logger.Warnf("Dependent resource cleanup failed for %s %s: %v", resourceType, item.name, depErr)
}

o.resetRequestID(identifier...)
return fmt.Errorf("failed to delete %s %s with error: %s", resourceType, item.name, operationErrorMessage(op))
}
Expand Down
8 changes: 5 additions & 3 deletions pkg/destroy/gcp/image.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,24 @@ const (
)

func (o *ClusterUninstaller) listImages(ctx context.Context) ([]cloudResource, error) {
return o.listImagesWithFilter(ctx, "items(name),nextPageToken", o.isClusterResource)
return o.listImagesWithFilter(ctx, "items(name,labels),nextPageToken", func(item *compute.Image) bool {
return o.isClusterResource(item.Name) && !o.isSharedResource(item.Labels)
})
}

// listImagesWithFilter lists addresses in the project that satisfy the filter criteria.
// The fields parameter specifies which fields should be returned in the result, the filter string contains
// a filter string passed to the API to filter results. The filterFunc is a client-side filtering function
// that determines whether a particular result should be returned or not.
func (o *ClusterUninstaller) listImagesWithFilter(ctx context.Context, fields string, filterFunc resourceFilterFunc) ([]cloudResource, error) {
func (o *ClusterUninstaller) listImagesWithFilter(ctx context.Context, fields string, filterFunc func(item *compute.Image) bool) ([]cloudResource, error) {
o.Logger.Debugf("Listing images")
ctx, cancel := context.WithTimeout(ctx, defaultTimeout)
defer cancel()
result := []cloudResource{}
req := o.computeSvc.Images.List(o.ProjectID).Fields(googleapi.Field(fields))
err := req.Pages(ctx, func(list *compute.ImageList) error {
for _, item := range list.Items {
if filterFunc(item.Name) {
if filterFunc(item) {
o.Logger.Debugf("Found image: %s\n", item.Name)
result = append(result, cloudResource{
key: item.Name,
Expand Down
14 changes: 14 additions & 0 deletions pkg/destroy/gcp/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package gcp

import (
"context"
"fmt"

"github.com/pkg/errors"
"google.golang.org/api/compute/v1"
Expand All @@ -14,6 +15,19 @@ const (
routerResourceName = "router"
)

func (o *ClusterUninstaller) deleteRouterByName(ctx context.Context, resourceName string) error {
items, err := o.listRoutersWithFilter(ctx, "items(name),nextPageToken", func(item string) bool { return item == resourceName })
if err != nil {
return fmt.Errorf("failed to list router by name: %w", err)
}
for _, item := range items {
if err := o.deleteRouter(ctx, item); err != nil {
return fmt.Errorf("failed to delete router by name: %w", err)
}
}
return nil
}

func (o *ClusterUninstaller) listRouters(ctx context.Context) ([]cloudResource, error) {
return o.listRoutersWithFilter(ctx, "items(name),nextPageToken", o.isClusterResource)
}
Expand Down
Loading