diff --git a/pkg/defaultmonitortests/types.go b/pkg/defaultmonitortests/types.go index 0ee34a165930..a9fcd6796fd2 100644 --- a/pkg/defaultmonitortests/types.go +++ b/pkg/defaultmonitortests/types.go @@ -54,6 +54,7 @@ import ( "github.com/openshift/origin/pkg/monitortests/testframework/highcpumetriccollector" "github.com/openshift/origin/pkg/monitortests/testframework/highcputestanalyzer" + "github.com/openshift/origin/pkg/monitortests/testframework/intervaldurationsum" "github.com/openshift/origin/pkg/monitortests/testframework/intervalserializer" "github.com/openshift/origin/pkg/monitortests/testframework/knownimagechecker" "github.com/openshift/origin/pkg/monitortests/testframework/legacytestframeworkmonitortests" @@ -132,6 +133,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI monitorTestRegistry.AddMonitorTestOrDie("alert-summary-serializer", "Test Framework", alertanalyzer.NewAlertSummarySerializer()) monitorTestRegistry.AddMonitorTestOrDie("metrics-endpoints-down", "Test Framework", metricsendpointdown.NewMetricsEndpointDown()) + monitorTestRegistry.AddMonitorTestOrDie("interval-duration-sum", "Test Framework", intervaldurationsum.NewIntervalDurationSum()) monitorTestRegistry.AddMonitorTestOrDie("external-service-availability", "Test Framework", disruptionexternalservicemonitoring.NewAvailabilityInvariant()) monitorTestRegistry.AddMonitorTestOrDie("external-gcp-cloud-service-availability", "Test Framework", disruptionexternalgcpcloudservicemonitoring.NewCloudAvailabilityInvariant()) monitorTestRegistry.AddMonitorTestOrDie("external-aws-cloud-service-availability", "Test Framework", disruptionexternalawscloudservicemonitoring.NewCloudAvailabilityInvariant()) diff --git a/pkg/monitortests/testframework/intervaldurationsum/monitortest.go b/pkg/monitortests/testframework/intervaldurationsum/monitortest.go new file mode 100644 index 000000000000..4cc7181ac167 --- /dev/null +++ 
b/pkg/monitortests/testframework/intervaldurationsum/monitortest.go
@@ -0,0 +1,127 @@
+package intervaldurationsum
+
+import (
+	"context"
+	"fmt"
+	"path/filepath"
+	"time"
+
+	"github.com/openshift/origin/pkg/dataloader"
+	"github.com/openshift/origin/pkg/monitortestframework"
+	"github.com/sirupsen/logrus"
+
+	"github.com/openshift/origin/pkg/monitor/monitorapi"
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+	"k8s.io/client-go/rest"
+)
+
+// intervalDurationSum is a monitor test that sums the total duration of intervals
+// matching specific sources and writes the results to an autodl file.
+//
+// The generated autodl file will have the following schema:
+//   - IntervalSource (string): The source type of the intervals
+//   - TotalDurationSeconds (float64): Sum of all interval durations in seconds for that source
+//
+// The autodl file is written to the storage directory as
+// interval-duration-sum{timeSuffix}-{dataloader.AutoDataLoaderSuffix}; the table it
+// feeds is named interval_duration_sum.
+type intervalDurationSum struct {
+	// adminRESTConfig is recorded in StartCollection; no other method reads it.
+	adminRESTConfig *rest.Config
+}
+
+// NewIntervalDurationSum creates a monitor test that sums the total duration of intervals
+// for specific sources and writes the results to an autodl file.
+func NewIntervalDurationSum() monitortestframework.MonitorTest {
+	return &intervalDurationSum{}
+}
+
+// PrepareCollection is a no-op; this monitor test needs no setup.
+func (w *intervalDurationSum) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
+	return nil
+}
+
+// StartCollection records the admin REST config and starts nothing else.
+func (w *intervalDurationSum) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
+	w.adminRESTConfig = adminRESTConfig
+	return nil
+}
+
+// CollectData contributes no intervals or junits of its own.
+func (w *intervalDurationSum) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
+	return nil, nil, nil
+}
+
+// ConstructComputedIntervals adds no computed intervals.
+func (w *intervalDurationSum) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
+	return nil, nil
+}
+
+// EvaluateTestsFromConstructedIntervals produces no junits.
+func (w *intervalDurationSum) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
+	return nil, nil
+}
+
+// WriteContentToStorage sums, per tracked source, the duration of every interval in
+// finalIntervals and writes one row per source to the autodl data file in storageDir.
+// A failure to write the file is logged as a warning and is not returned, so this
+// method always returns nil.
+func (w *intervalDurationSum) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
+	logger := logrus.WithField("MonitorTest", "IntervalDurationSum")
+
+	// Define the interval sources to track
+	sourcesToTrack := []monitorapi.IntervalSource{
+		monitorapi.SourceMetricsEndpointDown,
+		monitorapi.SourceCPUMonitor,
+	}
+
+	// Calculate total duration for each source
+	rows := make([]map[string]string, 0, len(sourcesToTrack))
+	for _, source := range sourcesToTrack {
+		matchingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+			return eventInterval.Source == source
+		})
+
+		var totalDurationSeconds float64
+		for _, interval := range matchingIntervals {
+			// An interval whose To is unset (zero) or not after From would contribute a
+			// large negative value and corrupt the sum, so it is skipped.
+			if !interval.To.After(interval.From) {
+				continue
+			}
+			totalDurationSeconds += interval.To.Sub(interval.From).Seconds()
+		}
+
+		logger.Infof("Total duration for source %s: %.2f seconds across %d intervals", source, totalDurationSeconds, len(matchingIntervals))
+
+		rows = append(rows, map[string]string{
+			"IntervalSource":       string(source),
+			"TotalDurationSeconds": fmt.Sprintf("%.2f", totalDurationSeconds),
+		})
+	}
+
+	// Create autodl artifact with total durations per source
+	dataFile := dataloader.DataFile{
+		TableName: "interval_duration_sum",
+		Schema: map[string]dataloader.DataType{
+			"IntervalSource":       dataloader.DataTypeString,
+			"TotalDurationSeconds": dataloader.DataTypeFloat64,
+		},
+		Rows: rows,
+	}
+
+	// Create the file name using the autodl suffix
+	fileName := filepath.Join(storageDir, fmt.Sprintf("interval-duration-sum%s-%s", timeSuffix, dataloader.AutoDataLoaderSuffix))
+
+	// Write the data file; this is best effort, so a failure is only logged.
+	if err := dataloader.WriteDataFile(fileName, dataFile); err != nil {
+		logger.WithError(err).Warnf("unable to write data file: %s", fileName)
+	}
+
+	return nil
+}
+
+// Cleanup is a no-op; this monitor test holds no resources.
+func (w *intervalDurationSum) Cleanup(ctx context.Context) error {
+	return nil
+}
diff --git a/pkg/monitortests/testframework/metricsendpointdown/monitortest.go b/pkg/monitortests/testframework/metricsendpointdown/monitortest.go
index 5ed1c90c060a..3bb920af4b31 100644
--- a/pkg/monitortests/testframework/metricsendpointdown/monitortest.go
+++ b/pkg/monitortests/testframework/metricsendpointdown/monitortest.go
@@ -35,49 +35,73 @@ func (w *metricsEndpointDown) StartCollection(ctx context.Context, adminRESTConf
 }
 
 func (w *metricsEndpointDown) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
-	intervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
-	return intervals, nil, err
+	// Nothing is collected here - the Prometheus query and the overlap filtering both happen in ConstructComputedIntervals
+	return nil, nil, nil
 }
 
-func (*metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
-	return nil, nil
-}
-
-func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
-	failures := []string{}
+// ConstructComputedIntervals queries Prometheus for metrics-endpoint-down intervals and
+// returns only those that do not overlap a node Update or Reboot interval for the same
+// node. A failed query is returned as an error.
+func (w *metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
 	logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
-	metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
-		return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
-	})
-	logger.Infof("found %d metrics endpoint down intervals", len(metricsEndpointDownIntervals))
 
-	// We know these endpoints go down both during node update, and obviously during reboot, ignore overlap
-	// with either:
-	nodeUpdateIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+	// Query Prometheus for metrics endpoint down intervals
+	metricsEndpointDownIntervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
+	if err != nil {
+		return nil, err
+	}
+	logger.Infof("found %d metrics endpoint down intervals from Prometheus", len(metricsEndpointDownIntervals))
+
+	// Filter for node update and reboot intervals
+	// NOTE(review): this only sees SourceNodeState intervals already present in
+	// startingIntervals; if they are computed by another monitor test in this same
+	// phase, none will match and nothing is filtered out - confirm.
+	nodeUpdateIntervals := startingIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
 		return (eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Update") ||
 			(eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Reboot")
 	})
-	logger.Infof("found %d node update intervals", len(nodeUpdateIntervals))
+	logger.Infof("found %d node update/reboot intervals", len(nodeUpdateIntervals))
 
+	// Filter out metrics endpoint down intervals that overlap with node updates/reboots
+	filteredIntervals := monitorapi.Intervals{}
 	for _, downInterval := range metricsEndpointDownIntervals {
-		logger.Infof("checking metrics down interval: %s", downInterval)
 		restartsForNodeIntervals := nodeUpdateIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
 			return eventInterval.Locator.Keys[monitorapi.LocatorNodeKey] == downInterval.Locator.Keys[monitorapi.LocatorNodeKey]
 		})
 		overlapIntervals := utility.FindOverlap(restartsForNodeIntervals, downInterval)
 		if len(overlapIntervals) == 0 {
-			failures = append(failures, downInterval.String())
-			logger.Info("found no overlap with a node update")
+			// No overlap with node update/reboot - keep this interval
+			filteredIntervals = append(filteredIntervals, downInterval)
 		} else {
-			logger.Infof("found overlap with a node update: %s", overlapIntervals[0])
+			logger.Infof("filtering out metrics endpoint down interval due to overlap with node update/reboot: %s", downInterval)
 		}
 	}
 
+	logger.Infof("returning %d filtered metrics endpoint down intervals (filtered out %d that overlapped with node updates/reboots)",
+		len(filteredIntervals), len(metricsEndpointDownIntervals)-len(filteredIntervals))
+
+	return filteredIntervals, nil
+}
+
+// EvaluateTestsFromConstructedIntervals reports every metrics-endpoint-down interval in
+// finalIntervals as a failure, always paired with a success so the result is at worst a flake.
+func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
+	logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
+
+	// Get metrics endpoint down intervals - these have already been filtered in ConstructComputedIntervals
+	// to exclude overlaps with node updates/reboots
+	metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+		return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
+	})
+	logger.Infof("evaluating %d metrics endpoint down intervals (already filtered)", len(metricsEndpointDownIntervals))
+
 	junits := []*junitapi.JUnitTestCase{}
-	if len(failures) > 0 {
+	if len(metricsEndpointDownIntervals) > 0 {
+		failures := make([]string, 0, len(metricsEndpointDownIntervals))
+		for _, downInterval := range metricsEndpointDownIntervals {
+			failures = append(failures, downInterval.String())
+		}
 		testOutput := fmt.Sprintf("found prometheus reporting metrics endpoints down outside of a node update: \n %s", strings.Join(failures, "\n "))
-		// This metrics down interval did not overlap with any update for the corresponding node, fail/flake a junit:
-		// Limit to kubelet service, all we're querying right now?
 		junits = append(junits, &junitapi.JUnitTestCase{
 			Name: testName,
 			FailureOutput: &junitapi.FailureOutput{
@@ -85,14 +109,15 @@ func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Co
 			},
 		})
 	}
-	// Add a success so this is marked as a flake at worst, no idea what this will unleash in the wild.
+	// Add a success so this is marked as a flake at worst
 	junits = append(junits, &junitapi.JUnitTestCase{
 		Name: testName,
 	})
 	return junits, nil
 }
 
-func (*metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
+func (w *metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
+	// Nothing is written here - the intervaldurationsum monitor test writes the autodl file covering these intervals
 	return nil
 }
 