Integrate executor into investigation flow to execute actions

bergmannf · claude · bergmannf · commit 6f9cfff4fe7c · 2025-11-12T12:42:28.000+01:00
Extends the main investigate command to automatically execute actions returned by investigations using the executor framework. This completes the migration from direct external system calls to the action pattern.

Changes:
- Add executeActions() helper that creates an executor and runs all actions from an InvestigationResult
- Execute CCAM actions after CCAM investigation runs
- Execute main investigation actions after the alert investigation runs
- Configure executor with sensible defaults: 3 retries, concurrent execution for independent actions, continue on error
- Log action execution success/failure for observability

This enables investigations like CHGM to return actions instead of directly calling OCM/PagerDuty APIs, improving testability and separation of concerns.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/cadctl/cmd/investigate/investigate.go b/cadctl/cmd/investigate/investigate.go
@@ -17,14 +17,16 @@ limitations under the License.
 package investigate
 
 import (
+	"context"
 	"errors"
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
 
 	"github.com/openshift/configuration-anomaly-detection/pkg/backplane"
-	"github.com/openshift/configuration-anomaly-detection/pkg/investigations"
+	"github.com/openshift/configuration-anomaly-detection/pkg/executor"
+	investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
 	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
 	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
 	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/precheck"
@@ -202,12 +204,23 @@ func run(_ *cobra.Command, _ []string) error {
 	}
 	updateMetrics(alertInvestigation.Name(), &result)
 
+	// Execute ccam actions if any
+	if err := executeActions(builder, &result, ocmClient, pdClient, "ccam"); err != nil {
+		return fmt.Errorf("failed to execute ccam actions: %w", err)
+	}
+
 	logging.Infof("Starting investigation for %s", alertInvestigation.Name())
 	result, err = alertInvestigation.Run(builder)
 	if err != nil {
 		return err
 	}
 	updateMetrics(alertInvestigation.Name(), &result)
+
+	// Execute investigation actions if any
+	if err := executeActions(builder, &result, ocmClient, pdClient, alertInvestigation.Name()); err != nil {
+		return fmt.Errorf("failed to execute %s actions: %w", alertInvestigation.Name(), err)
+	}
+
 	return updateIncidentTitle(pdClient)
 }
 
@@ -289,3 +302,52 @@ func updateIncidentTitle(pdClient *pagerduty.SdkClient) error {
 	}
 	return nil
 }
+
+// executeActions runs the actions attached to result through an executor; it returns nil when there are none, otherwise any Build or Execute error.
+func executeActions(
+	builder investigation.ResourceBuilder,
+	result *investigation.InvestigationResult,
+	ocmClient *ocm.SdkClient,
+	pdClient *pagerduty.SdkClient,
+	investigationName string,
+) error {
+	// Nothing to do when the investigation produced no actions.
+	if len(result.Actions) == 0 {
+		logging.Debug("No actions to execute")
+		return nil
+	}
+
+	// Build resources: the executor input below needs resources.Cluster and resources.Notes.
+	resources, err := builder.Build()
+	if err != nil {
+		return fmt.Errorf("failed to build resources for action execution: %w", err)
+	}
+
+	// The executor is handed the OCM and PagerDuty clients plus the raw logger.
+	exec := executor.NewExecutor(ocmClient, pdClient, logging.RawLogger)
+
+	// Execution options are fixed here rather than taken from the caller.
+	input := &executor.ExecutorInput{
+		InvestigationName: investigationName,
+		Actions:           result.Actions,
+		Cluster:           resources.Cluster,
+		Notes:             resources.Notes,
+		Options: executor.ExecutionOptions{
+			DryRun:            false,
+			StopOnError:       false, // Continue executing actions even if one fails
+			MaxRetries:        3,
+			ConcurrentActions: true, // Use concurrent execution for better performance
+		},
+	}
+
+	logging.Infof("Executing %d actions for %s", len(result.Actions), investigationName)
+	if err := exec.Execute(context.Background(), input); err != nil {
+		// Log the failure for observability, then return it: run() wraps this
+		// error and returns it, so a failed action execution makes run() fail too.
+		logging.Errorf("Action execution failed for %s: %v", investigationName, err)
+		return err
+	}
+
+	logging.Infof("Successfully executed all actions for %s", investigationName)
+	return nil
+}
diff --git a/pkg/executor/errors.go b/pkg/executor/errors.go
@@ -1,6 +1,9 @@
 package executor
 
-import "fmt"
+import (
+	"fmt"
+	"strings"
+)
 
 // ActionValidationError indicates an action failed validation
 type ActionValidationError struct {
@@ -37,7 +40,13 @@ type MultipleActionsError struct {
 }
 
 func (e MultipleActionsError) Error() string {
-	return fmt.Sprintf("%d actions failed: %v", len(e.Errors), e.Errors[0])
+	errStrings := make([]string, 0, len(e.Errors))
+	for _, subErr := range e.Errors {
+		errString := fmt.Sprintf("- %s", subErr.Error())
+		errStrings = append(errStrings, errString)
+	}
+	errString := strings.Join(errStrings, "\n")
+	return fmt.Sprintf("%d actions failed: %v", len(e.Errors), errString)
 }
 
 func (e MultipleActionsError) Unwrap() error {
diff --git a/pkg/investigations/chgm/chgm.go b/pkg/investigations/chgm/chgm.go
@@ -18,6 +18,7 @@ import (
 	"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
 	"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
 	"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
+	"github.com/openshift/configuration-anomaly-detection/pkg/reports"
 	hivev1 "github.com/openshift/hive/apis/hive/v1"
 )
 
@@ -105,9 +106,9 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
 
 		// Otherwise, it's an investigation finding (e.g., CloudTrail data too old)
 		// Report this as a finding that needs manual investigation
-		notes.AppendWarning("Could not complete instance investigation: %s", err.Error())
+		r.Notes.AppendWarning("Could not complete instance investigation: %s", err.Error())
 		result.Actions = []types.Action{
-			executor.NoteFrom(notes),
+			executor.NoteFrom(r.Notes),
 			executor.Escalate("Investigation incomplete - manual review required"),
 		}
 		return result, nil
@@ -116,12 +117,12 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
 
 	if !res.UserAuthorized {
 		logging.Infof("Instances were stopped by unauthorized user: %s / arn: %s", res.User.UserName, res.User.IssuerUserName)
-		notes.AppendAutomation("Customer stopped instances. Sent LS and silencing alert.")
+		r.Notes.AppendAutomation("Customer stopped instances. Sent LS and silencing alert.")
 
 		result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"StoppedInstances"}}
 		result.Actions = []types.Action{
 			executor.NewLimitedSupportAction(stoppedInfraLS.Summary, stoppedInfraLS.Details).Build(),
-			executor.NoteFrom(notes),
+			executor.NoteFrom(r.Notes),
 			executor.Silence("Customer stopped instances - cluster in limited support"),
 		}
 		return result, nil
@@ -156,14 +157,14 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
 		logging.Infof("Network verifier reported failure: %s", failureReason)
 
 		if strings.Contains(failureReason, "nosnch.in") {
-			notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
+			r.Notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
 
 			result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"EgressBlocked"}}
 			result.Actions = []types.Action{
 				executor.NewLimitedSupportAction(egressLS.Summary, egressLS.Details).
 					WithContext("EgressBlocked").
 					Build(),
-				executor.NoteFrom(notes),
+				executor.NoteFrom(r.Notes),
 				executor.Silence("Deadman's snitch blocked - cluster in limited support"),
 			}
 			return result, nil
@@ -172,15 +173,15 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
 		docLink := ocm.DocumentationLink(product, ocm.DocumentationTopicPrivatelinkFirewall)
 		egressSL := createEgressSL(failureReason, docLink)
 
-		notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason)
+		r.Notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason)
 
 		result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
 		result.Actions = []types.Action{
 			executor.NewServiceLogAction(egressSL.Severity, egressSL.Summary).
 				WithDescription(egressSL.Description).
 				WithServiceName(egressSL.ServiceName).
 				Build(),
-			executor.NoteFrom(notes),
+			executor.NoteFrom(r.Notes),
 			executor.Escalate("Egress blocked but not deadman's snitch - manual investigation required"),
 		}
 		return result, nil
@@ -203,7 +204,7 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
 
 	// Found no issues that CAD can handle by itself - forward notes to SRE.
 	result.Actions = []types.Action{
-		executor.NoteFrom(notes),
+		executor.NoteFrom(r.Notes),
 		executor.Escalate("No automated remediation available - manual investigation required"),
 	}
 	return result, nil
diff --git a/pkg/investigations/chgm/chgm_test.go b/pkg/investigations/chgm/chgm_test.go
@@ -11,6 +11,7 @@ import (
 	cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
 	servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1"
 	awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock"
+	backplanemock "github.com/openshift/configuration-anomaly-detection/pkg/backplane/mock"
 	"github.com/openshift/configuration-anomaly-detection/pkg/executor"
 	investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
 	"github.com/openshift/configuration-anomaly-detection/pkg/investigations/types"
@@ -43,10 +44,6 @@ func hasEscalateAction(actions []types.Action) bool {
 	return hasActionType(actions, string(executor.ActionTypeEscalateIncident))
 }
 
-func hasServiceLogAction(actions []types.Action) bool {
-	return hasActionType(actions, string(executor.ActionTypeServiceLog))
-}
-
 func hasNoteAction(actions []types.Action) bool {
 	return hasActionType(actions, string(executor.ActionTypePagerDutyNote))
 }