Skip to content

Commit 6f9cfff

Browse files
bergmannf and claude committed
Integrate executor into investigation flow to execute actions
Extends the main investigate command to automatically execute actions returned by investigations using the executor framework. This completes the migration from direct external system calls to the action pattern.

Changes:
- Add executeActions() helper that creates an executor and runs all actions from an InvestigationResult
- Execute CCAM actions after the CCAM investigation runs
- Execute main investigation actions after the alert investigation runs
- Configure executor with sensible defaults: 3 retries, concurrent execution for independent actions, continue on error
- Log action execution success/failure for observability

This enables investigations like CHGM to return actions instead of directly calling OCM/PagerDuty APIs, improving testability and separation of concerns.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 6a1cac0 commit 6f9cfff

File tree

4 files changed

+85
-16
lines changed

4 files changed

+85
-16
lines changed

cadctl/cmd/investigate/investigate.go

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@ limitations under the License.
1717
package investigate
1818

1919
import (
20+
"context"
2021
"errors"
2122
"fmt"
2223
"os"
2324
"strconv"
2425
"strings"
2526

2627
"github.com/openshift/configuration-anomaly-detection/pkg/backplane"
27-
"github.com/openshift/configuration-anomaly-detection/pkg/investigations"
28+
"github.com/openshift/configuration-anomaly-detection/pkg/executor"
29+
investigations "github.com/openshift/configuration-anomaly-detection/pkg/investigations"
2830
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
2931
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
3032
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/precheck"
@@ -202,12 +204,23 @@ func run(_ *cobra.Command, _ []string) error {
202204
}
203205
updateMetrics(alertInvestigation.Name(), &result)
204206

207+
// Execute ccam actions if any
208+
if err := executeActions(builder, &result, ocmClient, pdClient, "ccam"); err != nil {
209+
return fmt.Errorf("failed to execute ccam actions: %w", err)
210+
}
211+
205212
logging.Infof("Starting investigation for %s", alertInvestigation.Name())
206213
result, err = alertInvestigation.Run(builder)
207214
if err != nil {
208215
return err
209216
}
210217
updateMetrics(alertInvestigation.Name(), &result)
218+
219+
// Execute investigation actions if any
220+
if err := executeActions(builder, &result, ocmClient, pdClient, alertInvestigation.Name()); err != nil {
221+
return fmt.Errorf("failed to execute %s actions: %w", alertInvestigation.Name(), err)
222+
}
223+
211224
return updateIncidentTitle(pdClient)
212225
}
213226

@@ -289,3 +302,52 @@ func updateIncidentTitle(pdClient *pagerduty.SdkClient) error {
289302
}
290303
return nil
291304
}
305+
306+
// executeActions runs the actions attached to an investigation result through
// the executor package.
//
// When result.Actions is empty it returns nil immediately, without calling
// builder.Build(). Otherwise it builds the resources to obtain the Cluster and
// Notes handed to the executor, creates an executor from ocmClient and
// pdClient, and calls Execute with DryRun=false, StopOnError=false,
// MaxRetries=3 and ConcurrentActions=true. investigationName is copied into
// ExecutorInput.InvestigationName and used in the log messages.
//
// Execute is called with context.Background(), so no deadline or cancellation
// is passed in from the caller.
//
// Returns the wrapped Build error, or the Execute error (which is also logged
// here before being returned), or nil on success.
307+
func executeActions(
308+
builder investigation.ResourceBuilder,
309+
result *investigation.InvestigationResult,
310+
ocmClient *ocm.SdkClient,
311+
pdClient *pagerduty.SdkClient,
312+
investigationName string,
313+
) error {
314+
// Nothing to do when the investigation returned no actions
315+
if len(result.Actions) == 0 {
316+
logging.Debug("No actions to execute")
317+
return nil
318+
}
319+
320+
// Build resources; only resources.Cluster and resources.Notes are used below
321+
resources, err := builder.Build()
322+
if err != nil {
323+
return fmt.Errorf("failed to build resources for action execution: %w", err)
324+
}
325+
326+
// Create an executor for this call from the OCM and PagerDuty clients
327+
exec := executor.NewExecutor(ocmClient, pdClient, logging.RawLogger)
328+
329+
// Assemble the executor input with the fixed options used for every investigation
330+
input := &executor.ExecutorInput{
331+
InvestigationName: investigationName,
332+
Actions: result.Actions,
333+
Cluster: resources.Cluster,
334+
Notes: resources.Notes,
335+
Options: executor.ExecutionOptions{
336+
DryRun: false,
337+
StopOnError: false, // Continue executing actions even if one fails
338+
MaxRetries: 3,
339+
ConcurrentActions: true, // Use concurrent execution for better performance
340+
},
341+
}
342+
343+
logging.Infof("Executing %d actions for %s", len(result.Actions), investigationName)
344+
if err := exec.Execute(context.Background(), input); err != nil {
345+
// Log the failure here, then return it to the caller.
346+
// NOTE(review): an earlier comment claimed action failures do not fail the investigation, but the error is returned and run() propagates it - confirm which behavior is intended.
347+
logging.Errorf("Action execution failed for %s: %v", investigationName, err)
348+
return err
349+
}
350+
351+
logging.Infof("Successfully executed all actions for %s", investigationName)
352+
return nil
353+
}

pkg/executor/errors.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package executor
22

3-
import "fmt"
3+
import (
4+
"fmt"
5+
"strings"
6+
)
47

58
// ActionValidationError indicates an action failed validation
69
type ActionValidationError struct {
@@ -37,7 +40,13 @@ type MultipleActionsError struct {
3740
}
3841

3942
// Error returns "<N> actions failed: " followed by every sub-error in
// e.Errors, each rendered as "- <message>" and joined with newlines. There is
// no newline after the colon, so the first entry shares a line with the header.
//
// In this diff view the line marked "40-" is the removed implementation, which
// printed only e.Errors[0]; the lines marked "+" are its replacement.
//
// NOTE(review): subErr.Error() is called on every element, so a nil entry in
// e.Errors would panic - presumably callers never append nil; confirm.
func (e MultipleActionsError) Error() string {
40-
return fmt.Sprintf("%d actions failed: %v", len(e.Errors), e.Errors[0])
43+
errStrings := make([]string, 0, len(e.Errors))
44+
for _, subErr := range e.Errors {
45+
errString := fmt.Sprintf("- %s", subErr.Error())
46+
errStrings = append(errStrings, errString)
47+
}
48+
errString := strings.Join(errStrings, "\n")
49+
return fmt.Sprintf("%d actions failed: %v", len(e.Errors), errString)
4150
}
4251

4352
func (e MultipleActionsError) Unwrap() error {

pkg/investigations/chgm/chgm.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
1919
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
2020
"github.com/openshift/configuration-anomaly-detection/pkg/ocm"
21+
"github.com/openshift/configuration-anomaly-detection/pkg/reports"
2122
hivev1 "github.com/openshift/hive/apis/hive/v1"
2223
)
2324

@@ -105,9 +106,9 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
105106

106107
// Otherwise, it's an investigation finding (e.g., CloudTrail data too old)
107108
// Report this as a finding that needs manual investigation
108-
notes.AppendWarning("Could not complete instance investigation: %s", err.Error())
109+
r.Notes.AppendWarning("Could not complete instance investigation: %s", err.Error())
109110
result.Actions = []types.Action{
110-
executor.NoteFrom(notes),
111+
executor.NoteFrom(r.Notes),
111112
executor.Escalate("Investigation incomplete - manual review required"),
112113
}
113114
return result, nil
@@ -116,12 +117,12 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
116117

117118
if !res.UserAuthorized {
118119
logging.Infof("Instances were stopped by unauthorized user: %s / arn: %s", res.User.UserName, res.User.IssuerUserName)
119-
notes.AppendAutomation("Customer stopped instances. Sent LS and silencing alert.")
120+
r.Notes.AppendAutomation("Customer stopped instances. Sent LS and silencing alert.")
120121

121122
result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"StoppedInstances"}}
122123
result.Actions = []types.Action{
123124
executor.NewLimitedSupportAction(stoppedInfraLS.Summary, stoppedInfraLS.Details).Build(),
124-
executor.NoteFrom(notes),
125+
executor.NoteFrom(r.Notes),
125126
executor.Silence("Customer stopped instances - cluster in limited support"),
126127
}
127128
return result, nil
@@ -156,14 +157,14 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
156157
logging.Infof("Network verifier reported failure: %s", failureReason)
157158

158159
if strings.Contains(failureReason, "nosnch.in") {
159-
notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
160+
r.Notes.AppendAutomation("Egress `nosnch.in` blocked, sent limited support.")
160161

161162
result.LimitedSupportSet = investigation.InvestigationStep{Performed: true, Labels: []string{"EgressBlocked"}}
162163
result.Actions = []types.Action{
163164
executor.NewLimitedSupportAction(egressLS.Summary, egressLS.Details).
164165
WithContext("EgressBlocked").
165166
Build(),
166-
executor.NoteFrom(notes),
167+
executor.NoteFrom(r.Notes),
167168
executor.Silence("Deadman's snitch blocked - cluster in limited support"),
168169
}
169170
return result, nil
@@ -172,15 +173,15 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
172173
docLink := ocm.DocumentationLink(product, ocm.DocumentationTopicPrivatelinkFirewall)
173174
egressSL := createEgressSL(failureReason, docLink)
174175

175-
notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason)
176+
r.Notes.AppendWarning("NetworkVerifier found unreachable targets and sent the SL, but deadmanssnitch is not blocked! \n⚠️ Please investigate this cluster.\nUnreachable: \n%s", failureReason)
176177

177178
result.ServiceLogSent = investigation.InvestigationStep{Performed: true, Labels: nil}
178179
result.Actions = []types.Action{
179180
executor.NewServiceLogAction(egressSL.Severity, egressSL.Summary).
180181
WithDescription(egressSL.Description).
181182
WithServiceName(egressSL.ServiceName).
182183
Build(),
183-
executor.NoteFrom(notes),
184+
executor.NoteFrom(r.Notes),
184185
executor.Escalate("Egress blocked but not deadman's snitch - manual investigation required"),
185186
}
186187
return result, nil
@@ -203,7 +204,7 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv
203204

204205
// Found no issues that CAD can handle by itself - forward notes to SRE.
205206
result.Actions = []types.Action{
206-
executor.NoteFrom(notes),
207+
executor.NoteFrom(r.Notes),
207208
executor.Escalate("No automated remediation available - manual investigation required"),
208209
}
209210
return result, nil

pkg/investigations/chgm/chgm_test.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
1212
servicelogsv1 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1"
1313
awsmock "github.com/openshift/configuration-anomaly-detection/pkg/aws/mock"
14+
backplanemock "github.com/openshift/configuration-anomaly-detection/pkg/backplane/mock"
1415
"github.com/openshift/configuration-anomaly-detection/pkg/executor"
1516
investigation "github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
1617
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/types"
@@ -43,10 +44,6 @@ func hasEscalateAction(actions []types.Action) bool {
4344
return hasActionType(actions, string(executor.ActionTypeEscalateIncident))
4445
}
4546

46-
func hasServiceLogAction(actions []types.Action) bool {
47-
return hasActionType(actions, string(executor.ActionTypeServiceLog))
48-
}
49-
5047
// hasNoteAction returns the result of hasActionType for the
// executor.ActionTypePagerDutyNote action type. Presumably this is true when
// actions contains a PagerDuty note action; hasActionType is not shown in this
// hunk - confirm.
func hasNoteAction(actions []types.Action) bool {
5148
return hasActionType(actions, string(executor.ActionTypePagerDutyNote))
5249
}

0 commit comments

Comments
 (0)