From 2c4a32b9b2efeb0299ff279c0281c1d1a42a28c3 Mon Sep 17 00:00:00 2001 From: Benjamin Samuels <1222451+bsamuels453@users.noreply.github.com> Date: Thu, 11 Jan 2024 08:15:15 -0800 Subject: [PATCH 1/6] add artifact writer --- pkg/artifacts/artifacts.go | 74 ++++++++++++ pkg/artifacts/types.go | 1 + pkg/health/checker.go | 26 ++-- pkg/health/ethereum/consensus.go | 2 +- pkg/health/ethereum/execution_rpc.go | 94 ++++++++++++++- pkg/health/ethereum/network_checker.go | 158 ++++++++++--------------- pkg/health/types/types.go | 24 +++- pkg/runtime.go | 87 ++------------ 8 files changed, 277 insertions(+), 189 deletions(-) create mode 100644 pkg/artifacts/artifacts.go create mode 100644 pkg/artifacts/types.go diff --git a/pkg/artifacts/artifacts.go b/pkg/artifacts/artifacts.go new file mode 100644 index 0000000..6f619e0 --- /dev/null +++ b/pkg/artifacts/artifacts.go @@ -0,0 +1,74 @@ +package artifacts + +import ( + chaosMesh "attacknet/cmd/pkg/chaos-mesh" + "attacknet/cmd/pkg/health" + healthTypes "attacknet/cmd/pkg/health/types" + "attacknet/cmd/pkg/types" + "errors" + "fmt" + "github.com/kurtosis-tech/stacktrace" + log "github.com/sirupsen/logrus" + "gopkg.in/yaml.v3" + "os" + path2 "path" + "time" +) + +type TestArtifact struct { + TestDescription string `yaml:"test_description"` + ContainersTargeted []string `yaml:"fault_injection_targets"` + TestPassed bool `yaml:"test_passed"` + HealthResult *healthTypes.HealthCheckResult `yaml:"health_check_results"` +} + +func BuildTestArtifact( + healthResults *healthTypes.HealthCheckResult, + podsUnderTest []*chaosMesh.PodUnderTest, + test types.SuiteTest, +) *TestArtifact { + + var containersTargeted []string + for _, p := range podsUnderTest { + containersTargeted = append(containersTargeted, p.GetName()) + } + + testPassed := health.AllChecksPassed(healthResults) + + return &TestArtifact{ + test.TestName, + containersTargeted, + testPassed, + healthResults, + } +} + +func SerializeTestArtifacts(artifacts []*TestArtifact) error { + artifactFilename := fmt.Sprintf("results-%d.yaml", time.Now().UnixMilli()) + + cwd, err := os.Getwd() + if err != nil { + return err + } + path := path2.Join(cwd, "artifacts") + + if _, err := os.Stat(path); errors.Is(err, os.ErrNotExist) { + err := os.Mkdir(path, os.ModePerm) + if err != nil { + log.Println(err) + } + } + + artifactPath := path2.Join(path, artifactFilename) + bs, err := yaml.Marshal(artifacts) + if err != nil { + return stacktrace.Propagate(err, "could not marshal test artifacts") + } + + err = os.WriteFile(artifactPath, bs, 0600) + if err != nil { + return stacktrace.Propagate(err, "could not write artifacts to %s", artifactPath) + } + log.Infof("Wrote test artifact to %s", artifactPath) + return nil +} diff --git a/pkg/artifacts/types.go b/pkg/artifacts/types.go new file mode 100644 index 0000000..33378d0 --- /dev/null +++ b/pkg/artifacts/types.go @@ -0,0 +1 @@ +package artifacts diff --git a/pkg/health/checker.go b/pkg/health/checker.go index 6161ba9..74cdc53 100644 --- a/pkg/health/checker.go +++ b/pkg/health/checker.go @@ -33,16 +33,18 @@ func BuildHealthChecker(cfg *confTypes.ConfigParsed, kubeClient *kubernetes.Kube return &CheckOrchestrator{checkerImpl: checkerImpl, gracePeriod: healthCheckConfig.GracePeriod}, nil } -func (hc *CheckOrchestrator) RunChecks(ctx context.Context) ([]*types.CheckResult, error) { +func (hc *CheckOrchestrator) RunChecks(ctx context.Context) (*types.HealthCheckResult, error) { start := time.Now() latestAllowable := start.Add(hc.gracePeriod) log.Infof("Allowing up to %.0f 
seconds for health checks to pass on all nodes", hc.gracePeriod.Seconds()) + lastHealthCheckResult := &types.HealthCheckResult{} for { - results, err := hc.checkerImpl.RunAllChecks(ctx) + results, err := hc.checkerImpl.RunAllChecks(ctx, lastHealthCheckResult) if err != nil { return nil, err } + lastHealthCheckResult = results if AllChecksPassed(results) { timeToPass := time.Since(start).Seconds() pctGraceUsed := timeToPass / hc.gracePeriod.Seconds() * 100 @@ -52,7 +54,7 @@ func (hc *CheckOrchestrator) RunChecks(ctx context.Context) ([]*types.CheckResul if time.Now().After(latestAllowable) { log.Warn("Grace period elapsed and a health check is still failing") - return results, stacktrace.NewError("tests failed") + return results, nil } else { log.Warn("Health checks failed but still in grace period") time.Sleep(1 * time.Second) @@ -60,11 +62,19 @@ func (hc *CheckOrchestrator) RunChecks(ctx context.Context) ([]*types.CheckResul } } -func AllChecksPassed(checks []*types.CheckResult) bool { - for _, r := range checks { - if len(r.PodsFailing) != 0 { - return false - } +func AllChecksPassed(checks *types.HealthCheckResult) bool { + if len(checks.LatestElBlockResult.FailingClientsReportedBlock) > 0 { + return false + } + if len(checks.LatestElBlockResult.FailingClientsReportedHash) > 0 { + return false } + if len(checks.FinalizedElBlockResult.FailingClientsReportedBlock) > 0 { + return false + } + if len(checks.FinalizedElBlockResult.FailingClientsReportedHash) > 0 { + return false + } + return true } diff --git a/pkg/health/ethereum/consensus.go b/pkg/health/ethereum/consensus.go index 33ccc8b..7e03067 100644 --- a/pkg/health/ethereum/consensus.go +++ b/pkg/health/ethereum/consensus.go @@ -16,7 +16,7 @@ type ClientForkChoice struct { BlockHash string } -func getExecNetworkConsensus(ctx context.Context, nodeClients []*ExecRpcClient, blockType string) ([]*ClientForkChoice, error) { +func getExecNetworkConsensus(ctx context.Context, nodeClients []*ExecClientRPC, blockType string) ([]*ClientForkChoice, error) { clientForkVotes := make([]*ClientForkChoice, len(nodeClients)) for i, client := range nodeClients { choice, err := client.GetLatestBlockBy(ctx, blockType) diff --git a/pkg/health/ethereum/execution_rpc.go b/pkg/health/ethereum/execution_rpc.go index 3ffd628..42f3a6c 100644 --- a/pkg/health/ethereum/execution_rpc.go +++ b/pkg/health/ethereum/execution_rpc.go @@ -1,33 +1,117 @@ package ethereum import ( + "attacknet/cmd/pkg/health/types" "attacknet/cmd/pkg/kubernetes" "context" "fmt" geth "github.com/ethereum/go-ethereum/core/types" "github.com/ethereum/go-ethereum/ethclient" "github.com/kurtosis-tech/stacktrace" + log "github.com/sirupsen/logrus" + "time" ) -type ExecRpcClient struct { +type ExecClientRPC struct { session *kubernetes.PortForwardsSession client *ethclient.Client } -func CreateExecRpcClient(session *kubernetes.PortForwardsSession) (*ExecRpcClient, error) { +func (e *EthNetworkChecker) getExecBlockConsensus(ctx context.Context, clients []*ExecClientRPC, blockType string, maxAttempts int) (*types.BlockConsensusTestResult, error) { + forkChoice, err := getExecNetworkConsensus(ctx, clients, blockType) + if err != nil { + return nil, err + } + // determine whether the nodes are in consensus + consensusBlockNum, wrongBlockNum, consensusBlockHash, wrongBlockHash := determineForkConsensus(forkChoice) + if len(wrongBlockNum) > 0 { + if maxAttempts > 0 { + log.Debugf("Nodes not at consensus for %s block. Waiting and re-trying in case we're on block propagation boundary. 
Attempts left: %d", blockType, maxAttempts-1) + time.Sleep(1 * time.Second) + return e.getExecBlockConsensus(ctx, clients, blockType, maxAttempts-1) + } else { + reportConsensusDataToLogger(blockType, consensusBlockNum, wrongBlockNum, consensusBlockHash, wrongBlockHash) + } + } + + blockNumWrong := make(map[string]uint64) + for _, node := range wrongBlockNum { + blockNumWrong[node.Pod.GetName()] = node.BlockNumber + } + + blockHashWrong := make(map[string]string) + + for _, node := range wrongBlockHash { + blockHashWrong[node.Pod.GetName()] = node.BlockHash + } + reportConsensusDataToLogger(blockType, consensusBlockNum, wrongBlockNum, consensusBlockHash, wrongBlockHash) + return &types.BlockConsensusTestResult{ + ConsensusBlock: (consensusBlockNum)[0].BlockNumber, + ConsensusHash: consensusBlockHash[0].BlockHash, + FailingClientsReportedBlock: blockNumWrong, + FailingClientsReportedHash: blockHashWrong, + }, nil +} + +func (e *EthNetworkChecker) dialToExecutionClients(ctx context.Context) ([]*ExecClientRPC, error) { + labelKey := "kurtosistech.com.custom/ethereum-package.client-type" + labelValue := "execution" + var podsToHealthCheck []kubernetes.KubePod + // add pods under test that match the label criteria + for _, pod := range e.podsUnderTest { + if pod.MatchesLabel(labelKey, labelValue) && !pod.ExpectDeath { + podsToHealthCheck = append(podsToHealthCheck, pod) + } + } + // add pods that were not targeted by a fault + bystanders, err := e.kubeClient.PodsMatchingLabel(ctx, labelKey, labelValue) + if err != nil { + return nil, err + } + for _, pod := range bystanders { + _, match := e.podsUnderTestLookup[pod.GetName()] + // don't add pods we've already added + if !match { + podsToHealthCheck = append(podsToHealthCheck, pod) + } + } + + log.Debugf("Starting port forward sessions to %d pods", len(podsToHealthCheck)) + portForwardSessions, err := e.kubeClient.StartMultiPortForwardToLabeledPods( + podsToHealthCheck, + labelKey, + labelValue, + 8545) + if err != nil { + return nil, err + } + + // dial out to clients + rpcClients := make([]*ExecClientRPC, len(portForwardSessions)) + for i, s := range portForwardSessions { + client, err := dialExecRpcClient(s) + if err != nil { + return nil, err + } + rpcClients[i] = client + } + return rpcClients, nil +} + +func dialExecRpcClient(session *kubernetes.PortForwardsSession) (*ExecClientRPC, error) { c, err := ethclient.Dial(fmt.Sprintf("http://localhost:%d", session.LocalPort)) if err != nil { return nil, stacktrace.Propagate(err, "err while dialing RPC for %s", session.Pod.GetName()) } - return &ExecRpcClient{session: session, client: c}, nil + return &ExecClientRPC{session: session, client: c}, nil } -func (c *ExecRpcClient) Close() { +func (c *ExecClientRPC) Close() { c.client.Close() c.session.Close() } -func (c *ExecRpcClient) GetLatestBlockBy(ctx context.Context, blockType string) (*ClientForkChoice, error) { +func (c *ExecClientRPC) GetLatestBlockBy(ctx context.Context, blockType string) (*ClientForkChoice, error) { // todo: handle pods that died and we didn't expect it var head *geth.Header var choice *ClientForkChoice diff --git a/pkg/health/ethereum/network_checker.go b/pkg/health/ethereum/network_checker.go index 7597963..2ab3687 100644 --- a/pkg/health/ethereum/network_checker.go +++ b/pkg/health/ethereum/network_checker.go @@ -4,16 +4,16 @@ import ( chaos_mesh "attacknet/cmd/pkg/chaos-mesh" "attacknet/cmd/pkg/kubernetes" "context" - "fmt" log "github.com/sirupsen/logrus" "time" ) import "attacknet/cmd/pkg/health/types" type 
EthNetworkChecker struct { - kubeClient *kubernetes.KubeClient - podsUnderTest []*chaos_mesh.PodUnderTest - podsUnderTestLookup map[string]*chaos_mesh.PodUnderTest + kubeClient *kubernetes.KubeClient + podsUnderTest []*chaos_mesh.PodUnderTest + podsUnderTestLookup map[string]*chaos_mesh.PodUnderTest + healthCheckStartTime time.Time } func CreateEthNetworkChecker(kubeClient *kubernetes.KubeClient, podsUnderTest []*chaos_mesh.PodUnderTest) *EthNetworkChecker { @@ -25,124 +25,92 @@ func CreateEthNetworkChecker(kubeClient *kubernetes.KubeClient, podsUnderTest [] } return &EthNetworkChecker{ - podsUnderTest: podsUnderTest, - podsUnderTestLookup: podsUnderTestMap, - kubeClient: kubeClient, + podsUnderTest: podsUnderTest, + podsUnderTestLookup: podsUnderTestMap, + kubeClient: kubeClient, + healthCheckStartTime: time.Now(), } } -func (e *EthNetworkChecker) RunAllChecks(ctx context.Context) ([]*types.CheckResult, error) { - labelKey := "kurtosistech.com.custom/ethereum-package.client-type" - labelValue := "execution" - - var podsToHealthCheck []kubernetes.KubePod - // add pods under test that match the label criteria - for _, pod := range e.podsUnderTest { - if pod.MatchesLabel(labelKey, labelValue) && !pod.ExpectDeath { - podsToHealthCheck = append(podsToHealthCheck, pod) - } - } - // add pods that were not targeted by a fault - bystanders, err := e.kubeClient.PodsMatchingLabel(ctx, labelKey, labelValue) - if err != nil { - return nil, err - } - for _, pod := range bystanders { - _, match := e.podsUnderTestLookup[pod.GetName()] - // don't add pods we've already added - if !match { - podsToHealthCheck = append(podsToHealthCheck, pod) - } - } - - log.Infof("Starting port forward sessions to %d pods", len(podsToHealthCheck)) - portForwardSessions, err := e.kubeClient.StartMultiPortForwardToLabeledPods( - podsToHealthCheck, - labelKey, - labelValue, - 8545) +func (e *EthNetworkChecker) RunAllChecks(ctx context.Context, prevHealthCheckResult *types.HealthCheckResult) (*types.HealthCheckResult, error) { + execRpcClients, err := e.dialToExecutionClients(ctx) if err != nil { return nil, err } - // dial out to clients - rpcClients := make([]*ExecRpcClient, len(portForwardSessions)) - for i, s := range portForwardSessions { - client, err := CreateExecRpcClient(s) - if err != nil { - return nil, err - } - rpcClients[i] = client - } - log.Debug("Ready to query for health checks") - latestResult, err := e.getBlockConsensus(ctx, rpcClients, "latest", 3) + latestResult, err := e.getExecBlockConsensus(ctx, execRpcClients, "latest", 5) if err != nil { return nil, err } - finalResult, err := e.getBlockConsensus(ctx, rpcClients, "finalized", 3) + latestArtifact := e.convertResultToArtifact(prevHealthCheckResult.LatestElBlockResult, latestResult) + + finalResult, err := e.getExecBlockConsensus(ctx, execRpcClients, "finalized", 3) if err != nil { return nil, err } + finalArtifact := e.convertResultToArtifact(prevHealthCheckResult.FinalizedElBlockResult, finalResult) - log.Infof("Finalization -> latest lag: %d", latestResult.ConsensusBlockNum-finalResult.ConsensusBlockNum) + log.Debugf("Finalization -> latest lag: %d", latestResult.ConsensusBlock-finalResult.ConsensusBlock) - // construct results - results := make([]*types.CheckResult, 4) - results[0] = latestResult.BlockNumResult - results[1] = latestResult.BlockHashResult - results[2] = finalResult.BlockNumResult - results[3] = finalResult.BlockHashResult + results := &types.HealthCheckResult{ + LatestElBlockResult: latestArtifact, + FinalizedElBlockResult: 
finalArtifact, + } return results, nil } -type getBlockConsensusResult struct { - BlockNumResult *types.CheckResult - BlockHashResult *types.CheckResult - ConsensusBlockNum uint64 - ConsensusBlockHash string -} +func (e *EthNetworkChecker) convertResultToArtifact( + prevArtifact *types.BlockConsensusArtifact, + result *types.BlockConsensusTestResult) *types.BlockConsensusArtifact { -func (e *EthNetworkChecker) getBlockConsensus(ctx context.Context, clients []*ExecRpcClient, blockType string, maxAttempts int) (*getBlockConsensusResult, error) { - forkChoice, err := getExecNetworkConsensus(ctx, clients, blockType) - if err != nil { - return nil, err + timeSinceChecksStarted := time.Since(e.healthCheckStartTime) + recoveredClients := make(map[string]int) + + if prevArtifact != nil { + // we only mark clients as recovered if at some point they were failing health checks. + for client := range prevArtifact.FailingClientsReportedHash { + if _, stillFailing := result.FailingClientsReportedHash[client]; !stillFailing { + recoveredClients[client] = int(timeSinceChecksStarted.Seconds()) + } + } + + for client := range prevArtifact.FailingClientsReportedBlock { + if _, stillFailing := result.FailingClientsReportedBlock[client]; !stillFailing { + recoveredClients[client] = int(timeSinceChecksStarted.Seconds()) + } + } + + // merge previously recovered clients with the new + for k, v := range prevArtifact.NodeRecoveryTimeSeconds { + recoveredClients[k] = v + } } - // determine whether the nodes are in consensus - consensusBlockNum, wrongBlockNum, consensusBlockHash, wrongBlockHash := determineForkConsensus(forkChoice) - if len(wrongBlockNum) > 0 { - if maxAttempts > 0 { - log.Debugf("Nodes not at consensus for %s block. Waiting and re-trying in case we're on block propagation boundary. 
Attempts left: %d", blockType, maxAttempts-1) - time.Sleep(2 * time.Second) - return e.getBlockConsensus(ctx, clients, blockType, maxAttempts-1) - } else { - reportConsensusDataToLogger(blockType, consensusBlockNum, wrongBlockNum, consensusBlockHash, wrongBlockHash) + + didUnfaultedNodesNeedToRecover := false + for client := range recoveredClients { + if _, wasUnderTest := e.podsUnderTestLookup[client]; !wasUnderTest { + didUnfaultedNodesNeedToRecover = true } } - blockNumResult := &types.CheckResult{} - blockNumResult.TestName = fmt.Sprintf("All nodes agree on %s block number", blockType) - for _, node := range consensusBlockNum { - blockNumResult.PodsPassing = append(blockNumResult.PodsPassing, node.Pod.GetName()) + didUnfaultedNodesFail := false + for client := range result.FailingClientsReportedBlock { + if _, wasUnderTest := e.podsUnderTestLookup[client]; !wasUnderTest { + didUnfaultedNodesFail = true + } } - for _, node := range wrongBlockNum { - blockNumResult.PodsFailing = append(blockNumResult.PodsFailing, node.Pod.GetName()) + for client := range result.FailingClientsReportedHash { + if _, wasUnderTest := e.podsUnderTestLookup[client]; !wasUnderTest { + didUnfaultedNodesFail = true + } } - blockHashResult := &types.CheckResult{} - blockHashResult.TestName = fmt.Sprintf("All nodes agree on %s block hash", blockType) - for _, node := range consensusBlockHash { - blockHashResult.PodsPassing = append(blockHashResult.PodsPassing, node.Pod.GetName()) - } - for _, node := range wrongBlockHash { - blockHashResult.PodsFailing = append(blockHashResult.PodsFailing, node.Pod.GetName()) + return &types.BlockConsensusArtifact{ + BlockConsensusTestResult: result, + DidUnfaultedNodesFail: didUnfaultedNodesFail, + DidUnfaultedNodesNeedToRecover: didUnfaultedNodesNeedToRecover, + NodeRecoveryTimeSeconds: recoveredClients, } - reportConsensusDataToLogger(blockType, consensusBlockNum, wrongBlockNum, consensusBlockHash, wrongBlockHash) - return &getBlockConsensusResult{ - blockNumResult, - blockHashResult, - consensusBlockNum[0].BlockNumber, - consensusBlockHash[0].BlockHash, - }, nil } diff --git a/pkg/health/types/types.go b/pkg/health/types/types.go index 2a8a5b0..79a3e8d 100644 --- a/pkg/health/types/types.go +++ b/pkg/health/types/types.go @@ -3,12 +3,24 @@ package types import "context" type GenericNetworkChecker interface { - RunAllChecks(context.Context) ([]*CheckResult, error) + RunAllChecks(context.Context, *HealthCheckResult) (*HealthCheckResult, error) } -type CheckResult struct { - // think of a better struct later - TestName string - PodsPassing []string - PodsFailing []string +type BlockConsensusTestResult struct { + ConsensusBlock uint64 `yaml:"consensus_block"` + ConsensusHash string `yaml:"consensus_hash"` + FailingClientsReportedBlock map[string]uint64 `yaml:"failing_clients_reported_block"` + FailingClientsReportedHash map[string]string `yaml:"failing_clients_reported_hash"` +} + +type BlockConsensusArtifact struct { + *BlockConsensusTestResult `yaml:",inline"` + DidUnfaultedNodesFail bool `yaml:"did_unfaulted_nodes_fail"` + DidUnfaultedNodesNeedToRecover bool `yaml:"did_unfaulted_nodes_need_to_recover"` + NodeRecoveryTimeSeconds map[string]int `yaml:"node_recovery_time_seconds"` +} + +type HealthCheckResult struct { + LatestElBlockResult *BlockConsensusArtifact `yaml:"latest_el_block_health_result"` + FinalizedElBlockResult *BlockConsensusArtifact `yaml:"finalized_el_block_health_result"` } diff --git a/pkg/runtime.go b/pkg/runtime.go index 6b75125..fda9720 100644 --- 
a/pkg/runtime.go +++ b/pkg/runtime.go @@ -1,6 +1,7 @@ package pkg import ( + "attacknet/cmd/pkg/artifacts" chaos_mesh "attacknet/cmd/pkg/chaos-mesh" "attacknet/cmd/pkg/health" "attacknet/cmd/pkg/kubernetes" @@ -40,6 +41,8 @@ func StartTestSuite(ctx context.Context, cfg *types.ConfigParsed) error { log.Infof("Running %d tests", len(cfg.TestConfig.Tests)) + var testArtifacts []*artifacts.TestArtifact + for i, test := range cfg.TestConfig.Tests { log.Infof("Running test #%d, '%s'", i, test.TestName) executor := test_executor.CreateTestExecutor(chaosClient, test) @@ -66,84 +69,20 @@ func StartTestSuite(ctx context.Context, cfg *types.ConfigParsed) error { if err != nil { return err } - // todo: log here - _ = results + testArtifact := artifacts.BuildTestArtifact(results, podsUnderTest, test) + testArtifacts = append(testArtifacts, testArtifact) + if !testArtifact.TestPassed { + log.Warn("Some health checks failed. Stopping test suite.") + break + } } } + err = artifacts.SerializeTestArtifacts(testArtifacts) + if err != nil { + return err + } enclave.Destroy(ctx) return nil - /* - faultSession, err := chaosClient.StartFault(ctx, cfg.Tests[0].FaultSpec) - if err != nil { - grafanaTunnel.Cleanup(true) - return err - } - - // start core logic loop here. - err = waitForInjectionCompleted(ctx, faultSession) - if err != nil { - grafanaTunnel.Cleanup(true) - return err - } - var timeToSleep time.Duration - if faultSession.TestDuration != nil { - durationSeconds := int(faultSession.TestDuration.Seconds()) - log.Infof("Fault injected successfully. Fault will run for %d seconds before recovering.", durationSeconds) - timeToSleep = *faultSession.TestDuration - } else { - log.Infof("Fault injected successfully. This fault has no specific duration.") - } - time.Sleep(timeToSleep) - - // we can build the health checker once the fault is injected - log.Info("creating health checker") - hc, err := health.BuildHealthChecker(cfg, kubeClient, faultSession.PodsUnderTest) - if err != nil { - return err - } - _ = hc - - err = waitForFaultRecovery(ctx, faultSession) - if err != nil { - grafanaTunnel.Cleanup(true) - return err - } - - _, err = hc.RunChecksUntilTimeout(ctx) - - return err*/ -} - -// todo: move to fault session? -/* - - -func waitForFaultRecovery(ctx context.Context, session *chaos_mesh.FaultSession) error { - for { - status, err := session.GetStatus(ctx) - if err != nil { - return err - } - - switch status { - case chaos_mesh.InProgress: - log.Infof("The fault is still finishing up. Sleeping for 10s") - time.Sleep(10 * time.Second) - case chaos_mesh.Stopping: - log.Infof("The fault is being stopped. Sleeping for 10s") - time.Sleep(10 * time.Second) - case chaos_mesh.Error: - log.Errorf("there was an error returned by chaos-mesh") - return errors.New("there was an unspecified error returned by chaos-mesh. 
inspect the fault resource") - case chaos_mesh.Completed: - log.Infof("The fault terminated successfully!") - return nil - default: - return stacktrace.NewError("unknown chaos session state %s", status) - } - // todo: add timeout break if no changes in k8s resource after fault duration elapses - } } -*/ From a47b28908e5bcdd779d773e95f97ad19fa2dace8 Mon Sep 17 00:00:00 2001 From: Benjamin Samuels <1222451+bsamuels453@users.noreply.github.com> Date: Thu, 11 Jan 2024 08:31:49 -0800 Subject: [PATCH 2/6] improve logging --- pkg/runtime.go | 2 +- pkg/test_executor/executor.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/runtime.go b/pkg/runtime.go index fda9720..7f7dd47 100644 --- a/pkg/runtime.go +++ b/pkg/runtime.go @@ -44,7 +44,7 @@ func StartTestSuite(ctx context.Context, cfg *types.ConfigParsed) error { var testArtifacts []*artifacts.TestArtifact for i, test := range cfg.TestConfig.Tests { - log.Infof("Running test #%d, '%s'", i, test.TestName) + log.Infof("Running test (%d/%d): '%s'", i, len(cfg.TestConfig.Tests), test.TestName) executor := test_executor.CreateTestExecutor(chaosClient, test) err = executor.RunTestPlan(ctx) diff --git a/pkg/test_executor/executor.go b/pkg/test_executor/executor.go index fe03bbf..b160889 100644 --- a/pkg/test_executor/executor.go +++ b/pkg/test_executor/executor.go @@ -27,12 +27,12 @@ func (te *TestExecutor) RunTestPlan(ctx context.Context) error { if te.planCompleted { return stacktrace.NewError("test executor %s has already been run", te.testName) } - for _, genericStep := range te.planSteps { + for i, genericStep := range te.planSteps { marshalledSpec, err := yaml.Marshal(genericStep.Spec) if err != nil { return stacktrace.Propagate(err, "could not marshal plan step %s", genericStep.Spec) } - log.Infof("Running test step '%s'", genericStep.StepDescription) + log.Infof("Running test step (%d/%d): '%s'", i, len(te.planSteps), genericStep.StepDescription) switch genericStep.StepType { case types.InjectFault: var s PlanStepSingleFault From 0c70d7a73a04fb21b4db4cbd2c9bd15ffcf9ac2e Mon Sep 17 00:00:00 2001 From: Benjamin Samuels <1222451+bsamuels453@users.noreply.github.com> Date: Thu, 11 Jan 2024 08:47:52 -0800 Subject: [PATCH 3/6] off by one bug --- pkg/runtime.go | 2 +- pkg/test_executor/executor.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/runtime.go b/pkg/runtime.go index 7f7dd47..2423e07 100644 --- a/pkg/runtime.go +++ b/pkg/runtime.go @@ -44,7 +44,7 @@ func StartTestSuite(ctx context.Context, cfg *types.ConfigParsed) error { var testArtifacts []*artifacts.TestArtifact for i, test := range cfg.TestConfig.Tests { - log.Infof("Running test (%d/%d): '%s'", i, len(cfg.TestConfig.Tests), test.TestName) + log.Infof("Running test (%d/%d): '%s'", i+1, len(cfg.TestConfig.Tests), test.TestName) executor := test_executor.CreateTestExecutor(chaosClient, test) err = executor.RunTestPlan(ctx) diff --git a/pkg/test_executor/executor.go b/pkg/test_executor/executor.go index b160889..5354086 100644 --- a/pkg/test_executor/executor.go +++ b/pkg/test_executor/executor.go @@ -32,7 +32,7 @@ func (te *TestExecutor) RunTestPlan(ctx context.Context) error { if err != nil { return stacktrace.Propagate(err, "could not marshal plan step %s", genericStep.Spec) } - log.Infof("Running test step (%d/%d): '%s'", i, len(te.planSteps), genericStep.StepDescription) + log.Infof("Running test step (%d/%d): '%s'", i+1, len(te.planSteps), genericStep.StepDescription) switch genericStep.StepType { case types.InjectFault: var s 
PlanStepSingleFault From e9f01c212411c09c745299313c490651a1738c6f Mon Sep 17 00:00:00 2001 From: Benjamin Samuels <1222451+bsamuels453@users.noreply.github.com> Date: Thu, 11 Jan 2024 08:53:16 -0800 Subject: [PATCH 4/6] maker grace period and initial wait time configurable for plans --- pkg/plan/plan.go | 74 +--------------------------- pkg/plan/suite/faults.go | 2 +- pkg/plan/suite/suite_builder.go | 44 ++++++++++++++--- pkg/plan/suite/test_builder.go | 11 ++--- pkg/plan/suite/types.go | 3 ++ planner-configs/clock-skew-reth.yaml | 7 ++- 6 files changed, 53 insertions(+), 88 deletions(-) diff --git a/pkg/plan/plan.go b/pkg/plan/plan.go index 3cbb979..38d414f 100644 --- a/pkg/plan/plan.go +++ b/pkg/plan/plan.go @@ -30,7 +30,7 @@ func BuildPlan(planName string, config *PlannerConfig) error { attacknetConfig = types.AttacknetConfig{ GrafanaPodName: "grafana", GrafanaPodPort: "3000", - WaitBeforeInjectionSeconds: 0, + WaitBeforeInjectionSeconds: uint32(config.FaultConfig.WaitBeforeFirstTest.Seconds()), ReuseDevnetBetweenRuns: true, AllowPostFaultInspection: false, } @@ -38,7 +38,7 @@ func BuildPlan(planName string, config *PlannerConfig) error { attacknetConfig = types.AttacknetConfig{ GrafanaPodName: "grafana", GrafanaPodPort: "3000", - WaitBeforeInjectionSeconds: 0, + WaitBeforeInjectionSeconds: uint32(config.FaultConfig.WaitBeforeFirstTest.Seconds()), ReuseDevnetBetweenRuns: true, ExistingDevnetNamespace: config.KubernetesNamespace, AllowPostFaultInspection: false, @@ -67,73 +67,3 @@ func BuildPlan(planName string, config *PlannerConfig) error { return writePlans(netConfigPath, suiteConfigPath, networkConfig, suiteConfig) } - -/* - run time delay on various el/cl combos - -> each target exists in the same suite/network - - run time delay on group of el-cl nodes that use the same CL or EL - -> network minority - -> 33+ but less than 66% - - re-org on group of el-cl nodes that use the same CL or EL - - there's two steps, identifying targets, and creating the manifest for the target/test types - - targeting criteria types: - - percentages of the validator set (32, 33, 34, 50, 65)% - - subcategories: by node vs. by client - - target by client - - a specific node containing an instance of the client - - all nodes containing an instance of the client - - a specific instance of the client - - all instances of the client - - subcategories: target node or target client by criterion - - - clock skew - - extra varies: - - clock skew nodes by EL - - clock skew nodes by CL - - criterion: percentage(client, node), target by client(client, node) - - - restarts - - these restarts require resync - - criterion: percentages(client, node), target by client(client, node) - - network bandwidth - - extra varies: - - the amount of bandwidth - - whether the constraint is EL<-CL or node <-> network - - percentages - - client criterion (although not all client selections will be valid) - - network split - - percentages - - client criterion - - packet drop - - extra varies: loss pct, correlation - - latency - - extra varies: latency amount, correlation - - percentages (although includes 100%) - - clients (both type?) - - syncing faults - -> restart node, force to sync. inject fault while syncing. this impacts checkpoint sync probably too. 
- - packet corruption - - - each test builder needs a way to reject input corpus - eventually we'll want a way to block known bad inputs (ie: lodestar doesnt seem to re-establish peers correctly) - anotehr example: - - actual tasks: - - implement plan builder for each concept - - selector := buildParamsForNodeFault(node) -*/ -//return nil diff --git a/pkg/plan/suite/faults.go b/pkg/plan/suite/faults.go index e7bac06..e42f5d1 100644 --- a/pkg/plan/suite/faults.go +++ b/pkg/plan/suite/faults.go @@ -107,7 +107,7 @@ func buildPodRestartFault(description string, expressionSelectors []ChaosExpress Kind: "PodChaos", ApiVersion: "chaos-mesh.org/v1alpha1", Spec: PodChaosSpec{ - Duration: "10s", + Duration: "1s", Mode: "all", Selector: Selector{ ExpressionSelectors: expressionSelectors, diff --git a/pkg/plan/suite/suite_builder.go b/pkg/plan/suite/suite_builder.go index db33cb2..1b56d94 100644 --- a/pkg/plan/suite/suite_builder.go +++ b/pkg/plan/suite/suite_builder.go @@ -52,8 +52,19 @@ func ComposeTestSuite( runtimeEstimate += int(d.Seconds()) } } + var targetingDescription string + if targetDimension == TargetMatchingNode { + targetingDescription = fmt.Sprintf("Impacting the full node of targeted %s clients. Injecting into %s of the matching targets.", config.TargetClient, attackSize) + } else { + targetingDescription = fmt.Sprintf("Impacting the client of targeted %s clients. Injecting into %s of the matching targets.", config.TargetClient, attackSize) + } - test, err := composeTestsForFaultType(config.FaultType, faultConfig, targetSelectors) + test, err := composeTestForFaultType( + config.FaultType, + faultConfig, + targetSelectors, + targetingDescription, + ) if err != nil { return nil, err } @@ -67,10 +78,12 @@ func ComposeTestSuite( return tests, nil } -func composeTestsForFaultType( +func composeTestForFaultType( faultType FaultTypeEnum, config map[string]string, - targetSelectors []*ChaosTargetSelector) (*types.SuiteTest, error) { + targetSelectors []*ChaosTargetSelector, + targetingDescription string, +) (*types.SuiteTest, error) { switch faultType { case FaultClockSkew: @@ -82,11 +95,28 @@ func composeTestsForFaultType( if !ok { return nil, stacktrace.NewError("missing duration field for clock skew fault") } - description := fmt.Sprintf("Apply %s clock skew for %s against %d targets", skew, duration, len(targetSelectors)) - return composeNodeClockSkewTest(description, targetSelectors, skew, duration) + grace, ok := config["grace_period"] + if !ok { + return nil, stacktrace.NewError("missing grace_period field for clock skew fault") + } + graceDuration, err := time.ParseDuration(grace) + if err != nil { + return nil, stacktrace.NewError("unable to convert grace_period field to a time duration for clock skew fault") + } + + description := fmt.Sprintf("Apply %s clock skew for %s against %d targets. %s", skew, duration, len(targetSelectors), targetingDescription) + return composeNodeClockSkewTest(description, targetSelectors, skew, duration, graceDuration) case FaultContainerRestart: - description := fmt.Sprintf("Restarting %d targets", len(targetSelectors)) - return composeNodeRestartTest(description, targetSelectors) + grace, ok := config["grace_period"] + if !ok { + return nil, stacktrace.NewError("missing grace_period field for restsrt fault") + } + graceDuration, err := time.ParseDuration(grace) + if err != nil { + return nil, stacktrace.NewError("unable to convert grace_period field to a time duration for clock skew fault") + } + description := fmt.Sprintf("Restarting %d targets. 
%s", len(targetSelectors), targetingDescription) + return composeNodeRestartTest(description, targetSelectors, graceDuration) } return nil, nil diff --git a/pkg/plan/suite/test_builder.go b/pkg/plan/suite/test_builder.go index a1fab8d..6a41b55 100644 --- a/pkg/plan/suite/test_builder.go +++ b/pkg/plan/suite/test_builder.go @@ -5,10 +5,7 @@ import ( "time" ) -const clockSkewGracePeriod = time.Second * 1800 -const containerRestartGracePeriod = time.Second * 3600 - -func composeNodeClockSkewTest(description string, targets []*ChaosTargetSelector, skew, duration string) (*types.SuiteTest, error) { +func composeNodeClockSkewTest(description string, targets []*ChaosTargetSelector, skew, duration string, graceDuration time.Duration) (*types.SuiteTest, error) { var steps []types.PlanStep s, err := composeNodeClockSkewPlanSteps(targets, skew, duration) if err != nil { @@ -24,14 +21,14 @@ func composeNodeClockSkewTest(description string, targets []*ChaosTargetSelector PlanSteps: steps, HealthConfig: types.HealthCheckConfig{ EnableChecks: true, - GracePeriod: clockSkewGracePeriod, + GracePeriod: graceDuration, }, } return test, nil } -func composeNodeRestartTest(description string, targets []*ChaosTargetSelector) (*types.SuiteTest, error) { +func composeNodeRestartTest(description string, targets []*ChaosTargetSelector, graceDuration time.Duration) (*types.SuiteTest, error) { var steps []types.PlanStep s, err := composeNodeRestartSteps(targets) @@ -48,7 +45,7 @@ func composeNodeRestartTest(description string, targets []*ChaosTargetSelector) PlanSteps: steps, HealthConfig: types.HealthCheckConfig{ EnableChecks: true, - GracePeriod: containerRestartGracePeriod, + GracePeriod: graceDuration, }, } diff --git a/pkg/plan/suite/types.go b/pkg/plan/suite/types.go index 837b143..1dfe191 100644 --- a/pkg/plan/suite/types.go +++ b/pkg/plan/suite/types.go @@ -1,5 +1,7 @@ package suite +import "time" + type TargetingSpec string const ( @@ -49,6 +51,7 @@ var FaultTypes = map[FaultTypeEnum]bool{ type PlannerFaultConfiguration struct { FaultType FaultTypeEnum `yaml:"fault_type"` TargetClient string `yaml:"target_client"` + WaitBeforeFirstTest time.Duration `yaml:"wait_before_first_test"` FaultConfigDimensions []map[string]string `yaml:"fault_config_dimensions"` TargetingDimensions []TargetingSpec `yaml:"fault_targeting_dimensions"` AttackSizeDimensions []AttackSize `yaml:"fault_attack_size_dimensions"` diff --git a/planner-configs/clock-skew-reth.yaml b/planner-configs/clock-skew-reth.yaml index 27eeb3e..ce00c42 100644 --- a/planner-configs/clock-skew-reth.yaml +++ b/planner-configs/clock-skew-reth.yaml @@ -32,9 +32,14 @@ kubernetes_namespace: kt-ethereum fault_config: fault_type: ClockSkew target_client: reth + wait_before_first_test: 300s fault_config_dimensions: - skew: -2m - duration: 10m + duration: 1m + grace_period: 1800s + - skew: 2m + duration: 1m + grace_period: 1800s fault_targeting_dimensions: - MatchingNode - MatchingClient From 41259297e8c59b0d4f88eb8cadb5578bff0c2010 Mon Sep 17 00:00:00 2001 From: Benjamin Samuels <1222451+bsamuels453@users.noreply.github.com> Date: Thu, 11 Jan 2024 08:55:58 -0800 Subject: [PATCH 5/6] increase health check retries --- pkg/health/ethereum/network_checker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/health/ethereum/network_checker.go b/pkg/health/ethereum/network_checker.go index 2ab3687..93fad68 100644 --- a/pkg/health/ethereum/network_checker.go +++ b/pkg/health/ethereum/network_checker.go @@ -39,7 +39,7 @@ func (e *EthNetworkChecker) 
RunAllChecks(ctx context.Context, prevHealthCheckRes } log.Debug("Ready to query for health checks") - latestResult, err := e.getExecBlockConsensus(ctx, execRpcClients, "latest", 5) + latestResult, err := e.getExecBlockConsensus(ctx, execRpcClients, "latest", 15) if err != nil { return nil, err } From 8a41c3c908d248cab13a0e276773bb38701199bc Mon Sep 17 00:00:00 2001 From: Benjamin Samuels <1222451+bsamuels453@users.noreply.github.com> Date: Thu, 11 Jan 2024 08:56:16 -0800 Subject: [PATCH 6/6] update sample test suites --- network-configs/plan/test.yaml | 88 ------- network-configs/plan/testing.yaml | 20 +- test-suites/plan/reth-reorg.yaml | 18 -- test-suites/plan/test.yaml | 38 --- test-suites/plan/testing.yaml | 369 +++++++++++++++++++++++++++--- test-suites/suite.yaml | 4 +- 6 files changed, 352 insertions(+), 185 deletions(-) delete mode 100644 network-configs/plan/test.yaml delete mode 100644 test-suites/plan/reth-reorg.yaml delete mode 100644 test-suites/plan/test.yaml diff --git a/network-configs/plan/test.yaml b/network-configs/plan/test.yaml deleted file mode 100644 index 496a0a5..0000000 --- a/network-configs/plan/test.yaml +++ /dev/null @@ -1,88 +0,0 @@ -participants: - - el_client_type: geth - el_client_image: ethereum/client-go:latest - cl_client_type: lighthouse - cl_client_image: sigp/lighthouse:latest - el_min_cpu: 1000 - el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 - bn_min_cpu: 1000 - bn_max_cpu: 1000 - bn_min_mem: 2048 - bn_max_mem: 2048 - v_min_cpu: 1000 - v_max_cpu: 1000 - v_min_mem: 1024 - v_max_mem: 1024 - count: 1 - - el_client_type: reth - el_client_image: ghcr.io/paradigmxyz/reth:v0.1.0-alpha.13 - cl_client_type: lighthouse - cl_client_image: sigp/lighthouse:latest - el_min_cpu: 1000 - el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 - bn_min_cpu: 1000 - bn_max_cpu: 1000 - bn_min_mem: 2048 - bn_max_mem: 2048 - v_min_cpu: 1000 - v_max_cpu: 1000 - v_min_mem: 1024 - v_max_mem: 1024 - count: 1 - - el_client_type: reth - el_client_image: ghcr.io/paradigmxyz/reth:v0.1.0-alpha.13 - cl_client_type: prysm - cl_client_image: prysmaticlabs/prysm-beacon-chain:latest,prysmaticlabs/prysm-validator:latest - el_min_cpu: 1000 - el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 - bn_min_cpu: 1000 - bn_max_cpu: 1000 - bn_min_mem: 2048 - bn_max_mem: 2048 - v_min_cpu: 1000 - v_max_cpu: 1000 - v_min_mem: 1024 - v_max_mem: 1024 - count: 1 - - el_client_type: reth - el_client_image: ghcr.io/paradigmxyz/reth:v0.1.0-alpha.13 - cl_client_type: teku - cl_client_image: consensys/teku:23.12.0 - el_min_cpu: 1000 - el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 - bn_min_cpu: 1000 - bn_max_cpu: 1000 - bn_min_mem: 2048 - bn_max_mem: 2048 - count: 1 - - el_client_type: reth - el_client_image: ghcr.io/paradigmxyz/reth:v0.1.0-alpha.13 - cl_client_type: lodestar - cl_client_image: chainsafe/lodestar:v1.12.1 - el_min_cpu: 1000 - el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 - bn_min_cpu: 1000 - bn_max_cpu: 1000 - bn_min_mem: 2048 - bn_max_mem: 2048 - v_min_cpu: 1000 - v_max_cpu: 1000 - v_min_mem: 1024 - v_max_mem: 1024 - count: 1 -network_params: - num_validator_keys_per_node: "32" -additional_services: - - prometheus_grafana - - dora -parallel_keystore_generation: false diff --git a/network-configs/plan/testing.yaml b/network-configs/plan/testing.yaml index 53e6b9d..1eeaa1f 100644 --- a/network-configs/plan/testing.yaml +++ b/network-configs/plan/testing.yaml @@ -5,8 +5,8 @@ participants: cl_client_image: sigp/lighthouse:latest el_min_cpu: 1000 el_max_cpu: 1000 - 
el_min_mem: 2048 - el_max_mem: 2048 + el_min_mem: 1024 + el_max_mem: 1024 bn_min_cpu: 1000 bn_max_cpu: 1000 bn_min_mem: 2048 @@ -22,8 +22,8 @@ participants: cl_client_image: consensys/teku:23.12.0 el_min_cpu: 1000 el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 + el_min_mem: 1024 + el_max_mem: 1024 bn_min_cpu: 1000 bn_max_cpu: 1000 bn_min_mem: 2048 @@ -35,8 +35,8 @@ participants: cl_client_image: chainsafe/lodestar:v1.12.1 el_min_cpu: 1000 el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 + el_min_mem: 1024 + el_max_mem: 1024 bn_min_cpu: 1000 bn_max_cpu: 1000 bn_min_mem: 2048 @@ -52,8 +52,8 @@ participants: cl_client_image: sigp/lighthouse:latest el_min_cpu: 1000 el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 + el_min_mem: 1024 + el_max_mem: 1024 bn_min_cpu: 1000 bn_max_cpu: 1000 bn_min_mem: 2048 @@ -69,8 +69,8 @@ participants: cl_client_image: prysmaticlabs/prysm-beacon-chain:latest,prysmaticlabs/prysm-validator:latest el_min_cpu: 1000 el_max_cpu: 1000 - el_min_mem: 2048 - el_max_mem: 2048 + el_min_mem: 1024 + el_max_mem: 1024 bn_min_cpu: 1000 bn_max_cpu: 1000 bn_min_mem: 2048 diff --git a/test-suites/plan/reth-reorg.yaml b/test-suites/plan/reth-reorg.yaml deleted file mode 100644 index fede38c..0000000 --- a/test-suites/plan/reth-reorg.yaml +++ /dev/null @@ -1,18 +0,0 @@ -attacknetConfig: - grafanaPodName: grafana - grafanaPodPort: "3000" - allowPostFaultInspection: false - waitBeforeInjectionSeconds: 60 - reuseDevnetBetweenRuns: true - existingDevnetNamespace: kt-ethereum -harnessConfig: - networkType: ethereum - networkPackage: github.com/kurtosis-tech/ethereum-package - networkConfig: reth.yaml -testConfig: - tests: - - testName: clock skew - planSteps: [] - health: - enableChecks: true - gracePeriod: 2m0s diff --git a/test-suites/plan/test.yaml b/test-suites/plan/test.yaml deleted file mode 100644 index 52c5adf..0000000 --- a/test-suites/plan/test.yaml +++ /dev/null @@ -1,38 +0,0 @@ -attacknetConfig: - grafanaPodName: grafana - grafanaPodPort: "3000" - allowPostFaultInspection: false - waitBeforeInjectionSeconds: 0 - reuseDevnetBetweenRuns: true - existingDevnetNamespace: kt-ethereum -harnessConfig: - networkType: ethereum - networkPackage: github.com/kurtosis-tech/ethereum-package - networkConfig: plan/test.yaml -testConfig: - tests: - - testName: clock skew - health: - enableChecks: true - gracePeriod: 2m0s - planSteps: - - stepType: injectFault - description: 'Inject clock skew on target reth/teku Node (Node #3)' - chaosFaultSpec: - apiVersion: chaos-mesh.org/v1alpha1 - kind: TimeChaos - spec: - action: delay - duration: 1m - mode: all - selector: - expressionSelectors: - - key: kurtosistech.com/id - operator: In - values: - - el-3-reth-teku - - cl-3-teku-reth - timeOffset: -5m - - stepType: waitForFaultCompletion - description: wait for faults to terminate - diff --git a/test-suites/plan/testing.yaml b/test-suites/plan/testing.yaml index f007377..7be7ea8 100644 --- a/test-suites/plan/testing.yaml +++ b/test-suites/plan/testing.yaml @@ -2,7 +2,7 @@ attacknetConfig: grafanaPodName: grafana grafanaPodPort: "3000" allowPostFaultInspection: false - waitBeforeInjectionSeconds: 0 + waitBeforeInjectionSeconds: 300 reuseDevnetBetweenRuns: true existingDevnetNamespace: kt-ethereum harnessConfig: @@ -11,7 +11,7 @@ harnessConfig: networkConfig: plan/testing.yaml testConfig: tests: - - testName: Apply -2m clock skew for 10m against 1 targets + - testName: Apply -2m clock skew for 1m against 1 targets. Impacting the full node of targeted reth clients. 
Injecting into AttackOneMatching of the matching targets. planSteps: - stepType: injectFault description: 'Inject clock skew on target reth/teku Node (Node #2)' @@ -20,7 +20,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -34,8 +34,8 @@ testConfig: description: wait for faults to terminate health: enableChecks: true - gracePeriod: 2m0s - - testName: Apply -2m clock skew for 10m against 3 targets + gracePeriod: 30m0s + - testName: Apply 2m clock skew for 1m against 1 targets. Impacting the full node of targeted reth clients. Injecting into AttackOneMatching of the matching targets. planSteps: - stepType: injectFault description: 'Inject clock skew on target reth/teku Node (Node #2)' @@ -44,7 +44,31 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-2-reth-teku + - cl-2-teku-reth + timeOffset: 2m + - stepType: waitForFaultCompletion + description: wait for faults to terminate + health: + enableChecks: true + gracePeriod: 30m0s + - testName: Apply -2m clock skew for 1m against 3 targets. Impacting the full node of targeted reth clients. Injecting into AttackSupermajorityMatching of the matching targets. + planSteps: + - stepType: injectFault + description: 'Inject clock skew on target reth/teku Node (Node #2)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m mode: all selector: expressionSelectors: @@ -61,7 +85,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -79,7 +103,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -94,8 +118,68 @@ testConfig: description: wait for faults to terminate health: enableChecks: true - gracePeriod: 2m0s - - testName: Apply -2m clock skew for 10m against 4 targets + gracePeriod: 30m0s + - testName: Apply 2m clock skew for 1m against 3 targets. Impacting the full node of targeted reth clients. Injecting into AttackSupermajorityMatching of the matching targets. 
+ planSteps: + - stepType: injectFault + description: 'Inject clock skew on target reth/teku Node (Node #2)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-2-reth-teku + - cl-2-teku-reth + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth/lodestar Node (Node #3)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-3-reth-lodestar + - cl-3-lodestar-reth + - cl-3-lodestar-reth-validator + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth/lighthouse Node (Node #4)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-4-reth-lighthouse + - cl-4-lighthouse-reth + - cl-4-lighthouse-reth-validator + timeOffset: 2m + - stepType: waitForFaultCompletion + description: wait for faults to terminate + health: + enableChecks: true + gracePeriod: 30m0s + - testName: Apply -2m clock skew for 1m against 4 targets. Impacting the full node of targeted reth clients. Injecting into AttackAllMatching of the matching targets. planSteps: - stepType: injectFault description: 'Inject clock skew on target reth/teku Node (Node #2)' @@ -104,7 +188,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -121,7 +205,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -139,7 +223,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -157,7 +241,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -172,8 +256,86 @@ testConfig: description: wait for faults to terminate health: enableChecks: true - gracePeriod: 2m0s - - testName: Apply -2m clock skew for 10m against 1 targets + gracePeriod: 30m0s + - testName: Apply 2m clock skew for 1m against 4 targets. Impacting the full node of targeted reth clients. Injecting into AttackAllMatching of the matching targets. 
+ planSteps: + - stepType: injectFault + description: 'Inject clock skew on target reth/teku Node (Node #2)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-2-reth-teku + - cl-2-teku-reth + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth/lodestar Node (Node #3)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-3-reth-lodestar + - cl-3-lodestar-reth + - cl-3-lodestar-reth-validator + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth/lighthouse Node (Node #4)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-4-reth-lighthouse + - cl-4-lighthouse-reth + - cl-4-lighthouse-reth-validator + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth/prysm Node (Node #5)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-5-reth-prysm + - cl-5-prysm-reth + - cl-5-prysm-reth-validator + timeOffset: 2m + - stepType: waitForFaultCompletion + description: wait for faults to terminate + health: + enableChecks: true + gracePeriod: 30m0s + - testName: Apply -2m clock skew for 1m against 1 targets. Impacting the client of targeted reth clients. Injecting into AttackOneMatching of the matching targets. planSteps: - stepType: injectFault description: 'Inject clock skew on target reth client of reth/teku Node (Node #2)' @@ -182,7 +344,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -195,8 +357,31 @@ testConfig: description: wait for faults to terminate health: enableChecks: true - gracePeriod: 2m0s - - testName: Apply -2m clock skew for 10m against 3 targets + gracePeriod: 30m0s + - testName: Apply 2m clock skew for 1m against 1 targets. Impacting the client of targeted reth clients. Injecting into AttackOneMatching of the matching targets. + planSteps: + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/teku Node (Node #2)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-2-reth-teku + timeOffset: 2m + - stepType: waitForFaultCompletion + description: wait for faults to terminate + health: + enableChecks: true + gracePeriod: 30m0s + - testName: Apply -2m clock skew for 1m against 3 targets. Impacting the client of targeted reth clients. Injecting into AttackSupermajorityMatching of the matching targets. 
planSteps: - stepType: injectFault description: 'Inject clock skew on target reth client of reth/teku Node (Node #2)' @@ -205,7 +390,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -221,7 +406,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -237,7 +422,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -250,8 +435,8 @@ testConfig: description: wait for faults to terminate health: enableChecks: true - gracePeriod: 2m0s - - testName: Apply -2m clock skew for 10m against 4 targets + gracePeriod: 30m0s + - testName: Apply 2m clock skew for 1m against 3 targets. Impacting the client of targeted reth clients. Injecting into AttackSupermajorityMatching of the matching targets. planSteps: - stepType: injectFault description: 'Inject clock skew on target reth client of reth/teku Node (Node #2)' @@ -260,7 +445,62 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-2-reth-teku + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/lodestar Node (Node #3)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-3-reth-lodestar + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/lighthouse Node (Node #4)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-4-reth-lighthouse + timeOffset: 2m + - stepType: waitForFaultCompletion + description: wait for faults to terminate + health: + enableChecks: true + gracePeriod: 30m0s + - testName: Apply -2m clock skew for 1m against 4 targets. Impacting the client of targeted reth clients. Injecting into AttackAllMatching of the matching targets. + planSteps: + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/teku Node (Node #2)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m mode: all selector: expressionSelectors: @@ -276,7 +516,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -292,7 +532,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -308,7 +548,7 @@ testConfig: kind: TimeChaos spec: action: delay - duration: 10m + duration: 1m mode: all selector: expressionSelectors: @@ -321,4 +561,75 @@ testConfig: description: wait for faults to terminate health: enableChecks: true - gracePeriod: 2m0s + gracePeriod: 30m0s + - testName: Apply 2m clock skew for 1m against 4 targets. Impacting the client of targeted reth clients. Injecting into AttackAllMatching of the matching targets. 
+ planSteps: + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/teku Node (Node #2)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-2-reth-teku + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/lodestar Node (Node #3)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-3-reth-lodestar + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/lighthouse Node (Node #4)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-4-reth-lighthouse + timeOffset: 2m + - stepType: injectFault + description: 'Inject clock skew on target reth client of reth/prysm Node (Node #5)' + chaosFaultSpec: + apiVersion: chaos-mesh.org/v1alpha1 + kind: TimeChaos + spec: + action: delay + duration: 1m + mode: all + selector: + expressionSelectors: + - key: kurtosistech.com/id + operator: In + values: + - el-5-reth-prysm + timeOffset: 2m + - stepType: waitForFaultCompletion + description: wait for faults to terminate + health: + enableChecks: true + gracePeriod: 30m0s diff --git a/test-suites/suite.yaml b/test-suites/suite.yaml index 43e64a2..be249be 100644 --- a/test-suites/suite.yaml +++ b/test-suites/suite.yaml @@ -15,7 +15,7 @@ testConfig: - testName: clock-skew health: enableChecks: true - gracePeriod: 2m0s + gracePeriod: 30s planSteps: - stepType: injectFault description: 'Inject clock skew on target' @@ -30,6 +30,6 @@ testConfig: mode: all action: delay timeOffset: '-5m' - duration: 10m + duration: 1m - stepType: waitForFaultCompletion description: wait for faults to terminate