Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adding error handling to spans (Distributed Tracing) #728

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions bin/experiment/experiment.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"flag"
"fmt"
"os"

// Uncomment to load all auth plugins
Expand Down Expand Up @@ -68,6 +69,7 @@ import (
"github.com/litmuschaos/litmus-go/pkg/telemetry"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/codes"
)

func init() {
Expand Down Expand Up @@ -106,6 +108,8 @@ func main() {
//Getting kubeConfig and Generate ClientSets
if err := clients.GenerateClientSetFromKubeConfig(); err != nil {
log.Errorf("Unable to Get the kubeconfig, err: %v", err)
span.SetStatus(codes.Error, "Unable to Get the kubeconfig")
span.RecordError(err)
return
}

Expand Down Expand Up @@ -211,6 +215,7 @@ func main() {
k6Loadgen.Experiment(ctx, clients)
default:
log.Errorf("Unsupported -name %v, please provide the correct value of -name args", *experimentName)
span.SetStatus(codes.Error, fmt.Sprintf("Unsupported -name %v", *experimentName))
return
}
}
42 changes: 39 additions & 3 deletions chaoslib/litmus/pod-delete/lib/pod-delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/palantir/stacktrace"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/codes"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -46,14 +47,22 @@ func PreparePodDelete(ctx context.Context, experimentsDetails *experimentTypes.E
switch strings.ToLower(experimentsDetails.Sequence) {
case "serial":
if err := injectChaosInSerialMode(ctx, experimentsDetails, clients, chaosDetails, eventsDetails, resultDetails); err != nil {
span.SetStatus(codes.Error, "could not run chaos in serial mode")
span.RecordError(err)
return stacktrace.Propagate(err, "could not run chaos in serial mode")
}
case "parallel":
if err := injectChaosInParallelMode(ctx, experimentsDetails, clients, chaosDetails, eventsDetails, resultDetails); err != nil {
span.SetStatus(codes.Error, "could not run chaos in parallel mode")
span.RecordError(err)
return stacktrace.Propagate(err, "could not run chaos in parallel mode")
}
default:
return cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: fmt.Sprintf("'%s' sequence is not supported", experimentsDetails.Sequence)}
errReason := fmt.Sprintf("sequence '%s' is not supported", experimentsDetails.Sequence)
span.SetStatus(codes.Error, errReason)
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeGeneric, Reason: errReason}
span.RecordError(err)
return err
}

//Waiting for the ramp time after chaos injection
Expand All @@ -72,6 +81,8 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
// run the probes during chaos
if len(resultDetails.ProbeDetails) != 0 {
if err := probe.RunProbes(ctx, chaosDetails, clients, resultDetails, "DuringChaos", eventsDetails); err != nil {
span.SetStatus(codes.Error, "could not run the probes during chaos")
span.RecordError(err)
return err
}
}
Expand All @@ -85,18 +96,25 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
// Get the target pod details for the chaos execution
// if the target pod is not defined it will derive the random target pod list using pod affected percentage
if experimentsDetails.TargetPods == "" && chaosDetails.AppDetail == nil {
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "provide one of the appLabel or TARGET_PODS"}
span.SetStatus(codes.Error, "provide one of the appLabel or TARGET_PODS")
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "provide one of the appLabel or TARGET_PODS"}
span.RecordError(err)
return err
}

targetPodList, err := common.GetTargetPods(experimentsDetails.NodeLabel, experimentsDetails.TargetPods, experimentsDetails.PodsAffectedPerc, clients, chaosDetails)
if err != nil {
span.SetStatus(codes.Error, "could not get target pods")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get target pods")
}

// deriving the parent name of the target resources
for _, pod := range targetPodList.Items {
kind, parentName, err := workloads.GetPodOwnerTypeAndName(&pod, clients.DynamicClient)
if err != nil {
span.SetStatus(codes.Error, "could not get pod owner name and kind")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get pod owner name and kind")
}
common.SetParentName(parentName, kind, pod.Namespace, chaosDetails)
Expand All @@ -123,12 +141,16 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
err = clients.KubeClient.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, v1.DeleteOptions{})
}
if err != nil {
span.SetStatus(codes.Error, "could not delete the target pod")
span.RecordError(err)
return cerrors.Error{ErrorCode: cerrors.ErrorTypeChaosInject, Target: fmt.Sprintf("{podName: %s, namespace: %s}", pod.Name, pod.Namespace), Reason: fmt.Sprintf("failed to delete the target pod: %s", err.Error())}
}

switch chaosDetails.Randomness {
case true:
if err := common.RandomInterval(experimentsDetails.ChaosInterval); err != nil {
span.SetStatus(codes.Error, "could not get random chaos interval")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get random chaos interval")
}
default:
Expand All @@ -149,6 +171,8 @@ func injectChaosInSerialMode(ctx context.Context, experimentsDetails *experiment
Namespace: parent.Namespace,
}
if err = status.CheckUnTerminatedPodStatusesByWorkloadName(target, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
span.SetStatus(codes.Error, "could not check pod statuses by workload names")
span.RecordError(err)
return stacktrace.Propagate(err, "could not check pod statuses by workload names")
}
}
Expand Down Expand Up @@ -184,17 +208,24 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
// Get the target pod details for the chaos execution
// if the target pod is not defined it will derive the random target pod list using pod affected percentage
if experimentsDetails.TargetPods == "" && chaosDetails.AppDetail == nil {
return cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "please provide one of the appLabel or TARGET_PODS"}
span.SetStatus(codes.Error, "please provide one of the appLabel or TARGET_PODS")
err := cerrors.Error{ErrorCode: cerrors.ErrorTypeTargetSelection, Reason: "please provide one of the appLabel or TARGET_PODS"}
span.RecordError(err)
return err
}
targetPodList, err := common.GetTargetPods(experimentsDetails.NodeLabel, experimentsDetails.TargetPods, experimentsDetails.PodsAffectedPerc, clients, chaosDetails)
if err != nil {
span.SetStatus(codes.Error, "could not get target pods")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get target pods")
}

// deriving the parent name of the target resources
for _, pod := range targetPodList.Items {
kind, parentName, err := workloads.GetPodOwnerTypeAndName(&pod, clients.DynamicClient)
if err != nil {
span.SetStatus(codes.Error, "could not get pod owner name and kind")
span.RecordError(err)
return stacktrace.Propagate(err, "could not get pod owner name and kind")
}
common.SetParentName(parentName, kind, pod.Namespace, chaosDetails)
Expand All @@ -221,13 +252,16 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
err = clients.KubeClient.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, v1.DeleteOptions{})
}
if err != nil {
span.SetStatus(codes.Error, "could not delete the target pod")
span.RecordError(err)
return cerrors.Error{ErrorCode: cerrors.ErrorTypeChaosInject, Target: fmt.Sprintf("{podName: %s, namespace: %s}", pod.Name, pod.Namespace), Reason: fmt.Sprintf("failed to delete the target pod: %s", err.Error())}
}
}

switch chaosDetails.Randomness {
case true:
if err := common.RandomInterval(experimentsDetails.ChaosInterval); err != nil {
	// Mark the span as failed AND attach the error event, matching the
	// serial-mode path; SetStatus alone does not record the error details.
	span.SetStatus(codes.Error, "could not get random chaos interval")
	span.RecordError(err)
	return stacktrace.Propagate(err, "could not get random chaos interval")
}
default:
Expand All @@ -248,6 +282,8 @@ func injectChaosInParallelMode(ctx context.Context, experimentsDetails *experime
Namespace: parent.Namespace,
}
if err = status.CheckUnTerminatedPodStatusesByWorkloadName(target, experimentsDetails.Timeout, experimentsDetails.Delay, clients); err != nil {
span.SetStatus(codes.Error, "could not check pod statuses by workload names")
span.RecordError(err)
return stacktrace.Propagate(err, "could not check pod statuses by workload names")
}
}
Expand Down
22 changes: 22 additions & 0 deletions experiments/generic/pod-delete/experiment/pod-delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ import (
"github.com/litmuschaos/litmus-go/pkg/types"
"github.com/litmuschaos/litmus-go/pkg/utils/common"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)

// PodDelete inject the pod-delete chaos
func PodDelete(ctx context.Context, clients clients.ClientSets) {
span := trace.SpanFromContext(ctx)

experimentsDetails := experimentTypes.ExperimentDetails{}
resultDetails := types.ResultDetails{}
eventsDetails := types.EventDetails{}
Expand All @@ -40,6 +44,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
// Get values from chaosengine. Bail out upon error, as we haven't entered exp business logic yet
if err := types.GetValuesFromChaosEngine(&chaosDetails, clients, &resultDetails); err != nil {
log.Errorf("Unable to initialize the probes, err: %v", err)
span.SetStatus(codes.Error, "Unable to initialize the probes")
span.RecordError(err)
return
}
}
Expand All @@ -49,13 +55,17 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
if err := result.ChaosResult(&chaosDetails, clients, &resultDetails, "SOT"); err != nil {
log.Errorf("Unable to create the chaosresult, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Unable to create the chaosresult")
span.RecordError(err)
return
}

// Set the chaos result uid
if err := result.SetResultUID(&resultDetails, clients, &chaosDetails); err != nil {
log.Errorf("Unable to set the result uid, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Unable to set the result uid")
span.RecordError(err)
return
}

Expand Down Expand Up @@ -85,6 +95,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
log.Errorf("failed to create %v event inside chaosengine", types.PreChaosCheck)
}
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Application status check failed")
span.RecordError(err)
return
}
}
Expand All @@ -104,6 +116,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
log.Errorf("failed to create %v event inside chaosengine", types.PreChaosCheck)
}
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Probe Failed")
span.RecordError(err)
return
}
msg = common.GetStatusMessage(chaosDetails.DefaultHealthCheck, "AUT: Running", "Successful")
Expand All @@ -117,6 +131,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
if err := litmusLIB.PreparePodDelete(ctx, &experimentsDetails, clients, &resultDetails, &eventsDetails, &chaosDetails); err != nil {
log.Errorf("Chaos injection failed, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Chaos injection failed")
span.RecordError(err)
return
}

Expand All @@ -132,6 +148,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
types.SetEngineEventAttributes(&eventsDetails, types.PostChaosCheck, "AUT: Not Running", "Warning", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Application status check failed")
span.RecordError(err)
return
}
}
Expand All @@ -150,6 +168,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
log.Errorf("failed to create %v event inside chaosengine", types.PostChaosCheck)
}
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Probes Failed")
span.RecordError(err)
return
}
msg = common.GetStatusMessage(chaosDetails.DefaultHealthCheck, "AUT: Running", "Successful")
Expand All @@ -165,6 +185,8 @@ func PodDelete(ctx context.Context, clients clients.ClientSets) {
if err := result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT"); err != nil {
log.Errorf("Unable to update the chaosresult, err: %v", err)
result.RecordAfterFailure(&chaosDetails, &resultDetails, err, clients, &eventsDetails)
span.SetStatus(codes.Error, "Unable to update the chaosresult")
span.RecordError(err)
return
}

Expand Down
15 changes: 14 additions & 1 deletion pkg/probe/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/palantir/stacktrace"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/codes"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -32,6 +33,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
// get the probes details from the chaosengine
probes, err := getProbesFromChaosEngine(chaosDetails, clients)
if err != nil {
span.SetStatus(codes.Error, "getProbesFromChaosEngine failed")
span.RecordError(err)
return err
}

Expand All @@ -42,6 +45,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
switch strings.ToLower(probe.Mode) {
case "sot", "edge", "continuous":
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
span.RecordError(err)
return err
}
}
Expand All @@ -51,6 +56,8 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
for _, probe := range probes {
if strings.ToLower(probe.Mode) == "onchaos" {
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
span.RecordError(err)
return err
}
}
Expand All @@ -72,13 +79,19 @@ func RunProbes(ctx context.Context, chaosDetails *types.ChaosDetails, clients cl
}
}
if len(probeError) != 0 {
return cerrors.PreserveError{ErrString: fmt.Sprintf("[%s]", strings.Join(probeError, ","))}
errString := fmt.Sprintf("[%s]", strings.Join(probeError, ","))
span.SetStatus(codes.Error, errString)
err := cerrors.PreserveError{ErrString: errString}
span.RecordError(err)
return err
}
// executes the eot and edge modes
for _, probe := range probes {
switch strings.ToLower(probe.Mode) {
case "eot", "edge":
if err := execute(probe, chaosDetails, clients, resultDetails, phase); err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("%s mode %s probe execute failed", probe.Mode, probe.Name))
span.RecordError(err)
return err
}
}
Expand Down