-
Notifications
You must be signed in to change notification settings - Fork 1.5k
CORS-4278: Hardening the GCP destroy code to attempt to delete dependent resources #10035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,7 +50,6 @@ func (o *ClusterUninstaller) listCloudControllerBackendServices(ctx context.Cont | |
|
|
||
| fwList, err := o.computeSvc.Firewalls.List(o.ProjectID).Fields(googleapi.Field("items(name,targetTags),nextPageToken")).Context(ctx).Do() | ||
| if err != nil { | ||
| o.Logger.Debugf("failed to list firewall rules associated with backend service %s: %v", item.Name, err) | ||
| return false | ||
| } | ||
| for _, fw := range fwList.Items { | ||
|
|
@@ -103,9 +102,7 @@ func (o *ClusterUninstaller) listCloudControllerTargetPools(ctx context.Context, | |
| break | ||
| } | ||
| } | ||
|
|
||
| if !foundClusterResource { | ||
| o.Logger.Debugf("Skipping target pool instance %s because it is not a cluster resource", item.Name) | ||
| return false | ||
| } | ||
| } | ||
|
|
@@ -121,7 +118,10 @@ func (o *ClusterUninstaller) discoverCloudControllerLoadBalancerResources(ctx co | |
| loadBalancerFilterFunc := o.createLoadBalancerFilterFunc(loadBalancerName) | ||
|
|
||
| // Discover associated addresses: loadBalancerName | ||
| found, err := o.listAddressesWithFilter(ctx, regionalAddressResource, "items(name),nextPageToken", loadBalancerFilterFunc) | ||
| found, err := o.listAddressesWithFilter(ctx, regionalAddressResource, "items(name),nextPageToken", func(item *compute.Address) bool { | ||
| // address should have labels but none observed | ||
| return strings.HasPrefix(item.Name, loadBalancerName) | ||
| }) | ||
|
Comment on lines
+122
to
+124
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has the same check as |
||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
@@ -160,7 +160,12 @@ func (o *ClusterUninstaller) discoverCloudControllerLoadBalancerResources(ctx co | |
| o.insertPendingItems(firewallResourceName, found) | ||
|
|
||
| // Discover associated forwarding rules: loadBalancerName | ||
| found, err = o.listForwardingRulesWithFilter(ctx, regionForwardingRuleResource, "items(name),nextPageToken", loadBalancerFilterFunc) | ||
| found, err = o.listForwardingRulesWithFilter(ctx, regionForwardingRuleResource, "items(name,labels),nextPageToken", func(item *compute.ForwardingRule) bool { | ||
| // The forwarding rule should be checked for labels to ensure that | ||
| // it is owned and not shared. However, the forwarding rules associated with | ||
| // load balancers do not have these labels. | ||
| return strings.HasPrefix(item.Name, loadBalancerName) | ||
| }) | ||
|
Comment on lines
+164
to
+168
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same question as above: This has the same check as |
||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
@@ -203,7 +208,6 @@ func (o *ClusterUninstaller) discoverCloudControllerLoadBalancerResources(ctx co | |
| // For each of those backend services, resources like forwarding rules, firewalls, health checks and | ||
| // backend services are added to pendingItems | ||
| func (o *ClusterUninstaller) discoverCloudControllerResources(ctx context.Context) error { | ||
| o.Logger.Debugf("Discovering cloud controller resources") | ||
| errs := []error{} | ||
|
|
||
| // Instance group related items | ||
|
|
@@ -222,7 +226,6 @@ func (o *ClusterUninstaller) discoverCloudControllerResources(ctx context.Contex | |
| return err | ||
| } | ||
| for _, backend := range backends { | ||
| o.Logger.Debugf("Discovering cloud controller resources for %s", backend.name) | ||
| err := o.discoverCloudControllerLoadBalancerResources(ctx, backend.name) | ||
| if err != nil { | ||
| errs = append(errs, err) | ||
|
|
@@ -244,7 +247,6 @@ func (o *ClusterUninstaller) discoverCloudControllerResources(ctx context.Contex | |
| return err | ||
| } | ||
| for _, pool := range pools { | ||
| o.Logger.Debugf("Discovering cloud controller resources for %s", pool.name) | ||
| err := o.discoverCloudControllerLoadBalancerResources(ctx, pool.name) | ||
| if err != nil { | ||
| errs = append(errs, err) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ package gcp | |
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strings" | ||
|
|
||
| "github.com/pkg/errors" | ||
|
|
@@ -15,6 +16,19 @@ const ( | |
| firewallResourceName = "firewall" | ||
| ) | ||
|
|
||
| func (o *ClusterUninstaller) deleteFirewallByName(ctx context.Context, resourceName string) error { | ||
| items, err := o.listFirewallsWithFilter(ctx, "items(name),nextPageToken", func(item string) bool { return strings.Contains(item, resourceName) }) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same question as above: should we make it consistent as currently we use both |
||
| if err != nil { | ||
| return fmt.Errorf("failed to list firewall by name: %w", err) | ||
| } | ||
| for _, item := range items { | ||
| if err := o.deleteFirewall(ctx, item); err != nil { | ||
| return fmt.Errorf("failed to delete firewall by name: %w", err) | ||
| } | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (o *ClusterUninstaller) listFirewalls(ctx context.Context) ([]cloudResource, error) { | ||
| // The firewall rules that the destroyer is searching for here include a | ||
| // pattern before and after the cluster ID. Use a regular expression that allows | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,15 +16,36 @@ const ( | |
| regionForwardingRuleResource = "regionalforwardingrule" | ||
| ) | ||
|
|
||
| func (o *ClusterUninstaller) deleteForwardingRuleByName(ctx context.Context, resourceName, location string) error { | ||
| typeName := regionForwardingRuleResource | ||
| if location == string(global) { | ||
| typeName = globalForwardingRuleResource | ||
| } | ||
| items, err := o.listForwardingRulesWithFilter(ctx, typeName, "items(name,labels),nextPageToken", func(item *compute.ForwardingRule) bool { | ||
| return item.Name == resourceName && o.isOwnedResource(item.Labels) | ||
| }) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to list forwarding rules by name: %w", err) | ||
| } | ||
| for _, item := range items { | ||
| if err := o.deleteForwardingRule(ctx, item); err != nil { | ||
| return fmt.Errorf("failed to delete forwarding rule by name: %w", err) | ||
| } | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (o *ClusterUninstaller) listForwardingRules(ctx context.Context, typeName string) ([]cloudResource, error) { | ||
| return o.listForwardingRulesWithFilter(ctx, typeName, "items(name,region,loadBalancingScheme),nextPageToken", o.isClusterResource) | ||
| return o.listForwardingRulesWithFilter(ctx, typeName, "items(name,region,loadBalancingScheme,labels),nextPageToken", func(item *compute.ForwardingRule) bool { | ||
| return o.isClusterResource(item.Name) && o.isOwnedResource(item.Labels) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are other places (i.e.
Should we make it consistent? |
||
| }) | ||
| } | ||
|
|
||
| // listForwardingRulesWithFilter lists forwarding rules in the project that satisfy the filter criteria. | ||
| // The fields parameter specifies which fields should be returned in the result, the filter string contains | ||
| // a filter string passed to the API to filter results. The filterFunc is a client-side filtering function | ||
| // that determines whether a particular result should be returned or not. | ||
| func (o *ClusterUninstaller) listForwardingRulesWithFilter(ctx context.Context, typeName, fields string, filterFunc resourceFilterFunc) ([]cloudResource, error) { | ||
| func (o *ClusterUninstaller) listForwardingRulesWithFilter(ctx context.Context, typeName, fields string, filterFunc func(item *compute.ForwardingRule) bool) ([]cloudResource, error) { | ||
| o.Logger.Debugf("Listing forwarding rules") | ||
| ctx, cancel := context.WithTimeout(ctx, defaultTimeout) | ||
| defer cancel() | ||
|
|
@@ -33,7 +54,7 @@ func (o *ClusterUninstaller) listForwardingRulesWithFilter(ctx context.Context, | |
|
|
||
| pagesFunc := func(list *compute.ForwardingRuleList) error { | ||
| for _, item := range list.Items { | ||
| if filterFunc(item.Name) { | ||
| if filterFunc(item) { | ||
| logrus.Debugf("Found forwarding rule: %s", item.Name) | ||
| var quota []gcp.QuotaUsage | ||
| if item.LoadBalancingScheme == "EXTERNAL" { | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -45,6 +45,8 @@ const ( | |||||||
|
|
||||||||
| // DONE represents the done status for a compute service operation. | ||||||||
| DONE = "DONE" | ||||||||
|
|
||||||||
| alreadyInUseStr = "is already being used by " | ||||||||
| ) | ||||||||
|
|
||||||||
| // ClusterUninstaller holds the various options for the cluster we want to delete | ||||||||
|
|
@@ -441,6 +443,72 @@ func operationErrorMessage(op *compute.Operation) string { | |||||||
| return strings.Join(errs, ", ") | ||||||||
| } | ||||||||
|
|
||||||||
| func (o *ClusterUninstaller) handleDependentResourceError(ctx context.Context, op *compute.Operation, err error) error { | ||||||||
| errorCode := 0 | ||||||||
| errMsg := "" | ||||||||
| switch { | ||||||||
| case op != nil: | ||||||||
| errorCode = int(op.HttpErrorStatusCode) | ||||||||
| errMsg = operationErrorMessage(op) | ||||||||
| case err != nil: | ||||||||
| var apiErr *googleapi.Error | ||||||||
| if errors.As(err, &apiErr) { | ||||||||
| errorCode = apiErr.Code | ||||||||
| errMsg = apiErr.Message | ||||||||
| } else { | ||||||||
| errMsg = err.Error() | ||||||||
| } | ||||||||
| default: | ||||||||
| return fmt.Errorf("failed to extract information from operation or error") | ||||||||
| } | ||||||||
|
|
||||||||
| if errorCode == 400 && strings.Contains(errMsg, alreadyInUseStr) { | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. strings.Contains(errMsg, alreadyInUseStr)❓ How future-proof is this condition? Will it ever change? For example, the wording might change. I have always wondered if it's OK to parse an error message like this. Any thoughts?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any detail in the returned error that is more deterministic? Maybe an error code like
Also, is there anywhere in the error that can indicate the dependent resource id, rather than the error message? |
||||||||
| splitDetails := strings.Split(errMsg, alreadyInUseStr) | ||||||||
| resource := strings.ReplaceAll( | ||||||||
| strings.ReplaceAll( | ||||||||
| strings.ReplaceAll(splitDetails[len(splitDetails)-1], "\"", ""), | ||||||||
| "'", "", | ||||||||
| ), ", resourceInUseByAnotherResource", "", | ||||||||
| ) | ||||||||
| splitResource := strings.Split(resource, "/") | ||||||||
|
|
||||||||
| if len(splitResource) > 6 || len(splitResource) < 5 { | ||||||||
| return fmt.Errorf("dependent resource information unable to be parsed: %s", resource) | ||||||||
| } | ||||||||
| // global -> ex: projects/xxxxxxxxxxxxxxxx/global/resource-type/resource-name | ||||||||
| // regional -> ex: projects/xxxxxxxxxxxxxx/region/region-name/resource-type/resource-name | ||||||||
| location, resourceType, resourceName := splitResource[len(splitResource)-3], splitResource[len(splitResource)-2], splitResource[len(splitResource)-1] | ||||||||
| o.Logger.Debugf("found dependent resource information: %s, %s, %s", location, resourceType, resourceName) | ||||||||
|
Comment on lines
+466
to
+481
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This part can probably be refactored into a separate func, for example: func (o *ClusterUninstaller) parseResourceFromError(errMsg string) (location, resourceType, resourceName string, err error) {}But then I have the same question whether it is future-proof to parse the error message? |
||||||||
|
|
||||||||
| var deleteErr error | ||||||||
| switch resourceType { | ||||||||
| case "backendServices": | ||||||||
| deleteErr = o.deleteBackendServiceByName(ctx, resourceName, location) | ||||||||
| case "firewalls": | ||||||||
| deleteErr = o.deleteFirewallByName(ctx, resourceName) | ||||||||
| case "forwardingRules": | ||||||||
| deleteErr = o.deleteForwardingRuleByName(ctx, resourceName, location) | ||||||||
| case "subnetworks": | ||||||||
| deleteErr = o.deleteSubnetworkByName(ctx, resourceName) | ||||||||
| case "routers": | ||||||||
| deleteErr = o.deleteRouterByName(ctx, resourceName) | ||||||||
| case "addresses": | ||||||||
| deleteErr = o.deleteAddressByName(ctx, resourceName, location) | ||||||||
| case "targetPools": | ||||||||
| deleteErr = o.deleteTargetPoolByName(ctx, resourceName) | ||||||||
| case "targetTcpProxies": | ||||||||
| deleteErr = o.deleteTargetTCPProxyByName(ctx, resourceName) | ||||||||
| default: | ||||||||
| deleteErr = fmt.Errorf("failed to find resource type: %s for %s", resourceType, resourceName) | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| } | ||||||||
|
|
||||||||
| if deleteErr != nil { | ||||||||
| return fmt.Errorf("failed to delete dependent resource: %w", deleteErr) | ||||||||
| } | ||||||||
| } | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should have log when the deletion works: o.Logger.Infof("Successfully deleted dependent resource %s/%s", resourceType, resourceName) |
||||||||
| return nil | ||||||||
| } | ||||||||
|
|
||||||||
| func (o *ClusterUninstaller) handleOperation(ctx context.Context, op *compute.Operation, err error, item cloudResource, resourceType string) error { | ||||||||
| identifier := []string{item.typeName, item.name} | ||||||||
| if item.zone != "" { | ||||||||
|
|
@@ -454,13 +522,20 @@ func (o *ClusterUninstaller) handleOperation(ctx context.Context, op *compute.Op | |||||||
| o.deletePendingItems(item.typeName, []cloudResource{item}) | ||||||||
| return nil | ||||||||
| } | ||||||||
| return fmt.Errorf("failed to delete %s %s: %w", resourceType, item.name, err) | ||||||||
|
|
||||||||
| err = o.handleDependentResourceError(ctx, op, err) | ||||||||
| if err != nil { | ||||||||
| o.Logger.Debugf("failed to handle dependent resource error: %v", err) | ||||||||
| } | ||||||||
|
Comment on lines
+526
to
+529
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should still return the error here so that the destroy loop will try again to delete the resource next time, right? depErr := o.handleDependentResourceError(ctx, op, err)
if depErr != nil {
// If we successfully identified and attempted to delete a dependent resource,
// return a more informative error
return fmt.Errorf("failed to delete %s %s: %w (dependent resource handling: %w)", resourceType, item.name, err, depErr)
}
// If dependent resource was handled successfully, return original error
// The retry logic will attempt deletion again
return fmt.Errorf("failed to delete %s %s: %w", resourceType, item.name, err)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the gcp jobs are failing due to nil pointer panics (for example, ci/prow/e2e-gcp-ovn, ci/prow/e2e-gcp-default-config) are happening because of this 😓 |
||||||||
| } | ||||||||
|
|
||||||||
| // wait for operation to complete before checking any further | ||||||||
| op, err = o.waitFor(ctx, op, item) | ||||||||
|
|
||||||||
| if op != nil && op.Status == DONE && isErrorStatus(op.HttpErrorStatusCode) { | ||||||||
| err = o.handleDependentResourceError(ctx, op, err) | ||||||||
| if err != nil { | ||||||||
| o.Logger.Debugf("failed to handle dependent resource error: %v", err) | ||||||||
| } | ||||||||
|
Comment on lines
+535
to
+538
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should probably use a warning here: depErr := o.handleDependentResourceError(ctx, op, err)
if depErr != nil {
o.Logger.Warnf("Dependent resource cleanup failed for %s %s: %v", resourceType, item.name, depErr)
} |
||||||||
| o.resetRequestID(identifier...) | ||||||||
| return fmt.Errorf("failed to delete %s %s with error: %s", resourceType, item.name, operationErrorMessage(op)) | ||||||||
| } | ||||||||
|
|
||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We probably still want to keep this log?