From c3bdb3ec69701b20b5c5bd74633045c8183edd92 Mon Sep 17 00:00:00 2001
From: Devan Goodwin
Date: Wed, 10 Dec 2025 10:06:07 -0400
Subject: [PATCH] Track the total kubelet metrics outage durations with autodl
 framework

Only generate metrics-down intervals if they do not overlap with node reboots
or updates. Sum the total time the metrics endpoint was down on any node with
a new generic monitortest added for this purpose. Also sum high CPU intervals.
This will allow us to track whether changes are improving things and to
compare against past releases.
---
 pkg/defaultmonitortests/types.go                   |   2 +
 .../intervaldurationsum/monitortest.go             | 111 ++++++++++++++++++
 .../metricsendpointdown/monitortest.go             |  67 +++++++----
 3 files changed, 155 insertions(+), 25 deletions(-)
 create mode 100644 pkg/monitortests/testframework/intervaldurationsum/monitortest.go

diff --git a/pkg/defaultmonitortests/types.go b/pkg/defaultmonitortests/types.go
index 0ee34a165930..a9fcd6796fd2 100644
--- a/pkg/defaultmonitortests/types.go
+++ b/pkg/defaultmonitortests/types.go
@@ -54,6 +54,7 @@ import (
 	"github.com/openshift/origin/pkg/monitortests/testframework/highcpumetriccollector"
 	"github.com/openshift/origin/pkg/monitortests/testframework/highcputestanalyzer"
+	"github.com/openshift/origin/pkg/monitortests/testframework/intervaldurationsum"
 	"github.com/openshift/origin/pkg/monitortests/testframework/intervalserializer"
 	"github.com/openshift/origin/pkg/monitortests/testframework/knownimagechecker"
 	"github.com/openshift/origin/pkg/monitortests/testframework/legacytestframeworkmonitortests"
@@ -132,6 +133,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
 	monitorTestRegistry.AddMonitorTestOrDie("alert-summary-serializer", "Test Framework", alertanalyzer.NewAlertSummarySerializer())
 	monitorTestRegistry.AddMonitorTestOrDie("metrics-endpoints-down", "Test Framework", metricsendpointdown.NewMetricsEndpointDown())
+	monitorTestRegistry.AddMonitorTestOrDie("interval-duration-sum", "Test Framework", intervaldurationsum.NewIntervalDurationSum())
 	monitorTestRegistry.AddMonitorTestOrDie("external-service-availability", "Test Framework", disruptionexternalservicemonitoring.NewAvailabilityInvariant())
 	monitorTestRegistry.AddMonitorTestOrDie("external-gcp-cloud-service-availability", "Test Framework", disruptionexternalgcpcloudservicemonitoring.NewCloudAvailabilityInvariant())
 	monitorTestRegistry.AddMonitorTestOrDie("external-aws-cloud-service-availability", "Test Framework", disruptionexternalawscloudservicemonitoring.NewCloudAvailabilityInvariant())
diff --git a/pkg/monitortests/testframework/intervaldurationsum/monitortest.go b/pkg/monitortests/testframework/intervaldurationsum/monitortest.go
new file mode 100644
index 000000000000..4cc7181ac167
--- /dev/null
+++ b/pkg/monitortests/testframework/intervaldurationsum/monitortest.go
@@ -0,0 +1,111 @@
+package intervaldurationsum
+
+import (
+	"context"
+	"fmt"
+	"path/filepath"
+	"time"
+
+	"github.com/openshift/origin/pkg/dataloader"
+	"github.com/openshift/origin/pkg/monitortestframework"
+	"github.com/sirupsen/logrus"
+
+	"github.com/openshift/origin/pkg/monitor/monitorapi"
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+	"k8s.io/client-go/rest"
+)
+
+// intervalDurationSum is a monitor test that sums the total duration of intervals
+// matching specific sources and writes the results to an autodl file.
+//
+// The generated autodl file will have the following schema:
+//   - IntervalSource (string): The source type of the intervals
+//   - TotalDurationSeconds (float64): Sum of all interval durations in seconds for that source
+//
+// The autodl file will be named: interval-duration-sum{timeSuffix}-autodl.json
+type intervalDurationSum struct {
+	adminRESTConfig *rest.Config
+}
+
+// NewIntervalDurationSum creates a monitor test that sums the total duration of intervals
+// for specific sources and writes the results to an autodl file.
+func NewIntervalDurationSum() monitortestframework.MonitorTest {
+	return &intervalDurationSum{}
+}
+
+func (w *intervalDurationSum) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
+	return nil
+}
+
+func (w *intervalDurationSum) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
+	w.adminRESTConfig = adminRESTConfig
+	return nil
+}
+
+func (w *intervalDurationSum) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
+	return nil, nil, nil
+}
+
+func (w *intervalDurationSum) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
+	return nil, nil
+}
+
+func (w *intervalDurationSum) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
+	return nil, nil
+}
+
+func (w *intervalDurationSum) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
+	logger := logrus.WithField("MonitorTest", "IntervalDurationSum")
+
+	// Define the interval sources to track
+	sourcesToTrack := []monitorapi.IntervalSource{
+		monitorapi.SourceMetricsEndpointDown,
+		monitorapi.SourceCPUMonitor,
+	}
+
+	// Calculate total duration for each source
+	rows := []map[string]string{}
+	for _, source := range sourcesToTrack {
+		matchingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+			return eventInterval.Source == source
+		})
+
+		var totalDurationSeconds float64
+		for _, interval := range matchingIntervals {
+			duration := interval.To.Sub(interval.From).Seconds()
+			totalDurationSeconds += duration
+		}
+
+		logger.Infof("Total duration for source %s: %.2f seconds across %d intervals", source, totalDurationSeconds, len(matchingIntervals))
+
+		rows = append(rows, map[string]string{
+			"IntervalSource":       string(source),
+			"TotalDurationSeconds": fmt.Sprintf("%.2f", totalDurationSeconds),
+		})
+	}
+
+	// Create autodl artifact with total durations per source
+	dataFile := dataloader.DataFile{
+		TableName: "interval_duration_sum",
+		Schema: map[string]dataloader.DataType{
+			"IntervalSource":       dataloader.DataTypeString,
+			"TotalDurationSeconds": dataloader.DataTypeFloat64,
+		},
+		Rows: rows,
+	}
+
+	// Create the file name using the autodl suffix
+	fileName := filepath.Join(storageDir, fmt.Sprintf("interval-duration-sum%s-%s", timeSuffix, dataloader.AutoDataLoaderSuffix))
+
+	// Write the data file
+	err := dataloader.WriteDataFile(fileName, dataFile)
+	if err != nil {
+		logger.WithError(err).Warnf("unable to write data file: %s", fileName)
+	}
+
+	return nil
+}
+
+func (w *intervalDurationSum) Cleanup(ctx context.Context) error {
+	return nil
+}
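For reference (not part of the diff above): a minimal, self-contained sketch of the per-source summation the new monitor test performs. The interval type below is a hypothetical stand-in for monitorapi.Interval, and the durations are made-up example values; only the From/To subtraction and the %.2f formatting mirror the code in WriteContentToStorage.

package main

import (
	"fmt"
	"time"
)

// interval is a hypothetical stand-in carrying only the fields the summation needs.
type interval struct {
	From, To time.Time
}

func main() {
	base := time.Now()
	// Two hypothetical metrics-endpoint-down intervals: 60s and 35s.
	downIntervals := []interval{
		{From: base, To: base.Add(60 * time.Second)},
		{From: base.Add(5 * time.Minute), To: base.Add(5*time.Minute + 35*time.Second)},
	}

	var totalDurationSeconds float64
	for _, iv := range downIntervals {
		totalDurationSeconds += iv.To.Sub(iv.From).Seconds()
	}

	// Formatted the same way as the TotalDurationSeconds column.
	fmt.Printf("TotalDurationSeconds: %.2f\n", totalDurationSeconds) // prints 95.00
}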
diff --git a/pkg/monitortests/testframework/metricsendpointdown/monitortest.go b/pkg/monitortests/testframework/metricsendpointdown/monitortest.go
index 5ed1c90c060a..3bb920af4b31 100644
--- a/pkg/monitortests/testframework/metricsendpointdown/monitortest.go
+++ b/pkg/monitortests/testframework/metricsendpointdown/monitortest.go
@@ -35,49 +35,65 @@ func (w *metricsEndpointDown) StartCollection(ctx context.Context, adminRESTConf
 }
 
 func (w *metricsEndpointDown) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
-	intervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
-	return intervals, nil, err
+	// Don't return intervals here - we'll filter them in ConstructComputedIntervals
+	return nil, nil, nil
 }
 
-func (*metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
-	return nil, nil
-}
-
-func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
-	failures := []string{}
+func (w *metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
 	logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
-	metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
-		return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
-	})
-	logger.Infof("found %d metrics endpoint down intervals", len(metricsEndpointDownIntervals))
-	// We know these endpoints go down both during node update, and obviously during reboot, ignore overlap
-	// with either:
-	nodeUpdateIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+
+	// Query Prometheus for metrics endpoint down intervals
+	metricsEndpointDownIntervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
+	if err != nil {
+		return nil, err
+	}
+	logger.Infof("found %d metrics endpoint down intervals from Prometheus", len(metricsEndpointDownIntervals))
+
+	// Filter for node update and reboot intervals
+	nodeUpdateIntervals := startingIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
 		return (eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Update") || (eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Reboot")
 	})
-	logger.Infof("found %d node update intervals", len(nodeUpdateIntervals))
+	logger.Infof("found %d node update/reboot intervals", len(nodeUpdateIntervals))
 
+	// Filter out metrics endpoint down intervals that overlap with node updates/reboots
+	filteredIntervals := monitorapi.Intervals{}
 	for _, downInterval := range metricsEndpointDownIntervals {
-		logger.Infof("checking metrics down interval: %s", downInterval)
 		restartsForNodeIntervals := nodeUpdateIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
 			return eventInterval.Locator.Keys[monitorapi.LocatorNodeKey] == downInterval.Locator.Keys[monitorapi.LocatorNodeKey]
 		})
 		overlapIntervals := utility.FindOverlap(restartsForNodeIntervals, downInterval)
 		if len(overlapIntervals) == 0 {
-			failures = append(failures, downInterval.String())
logger.Info("found no overlap with a node update") + // No overlap with node update/reboot - keep this interval + filteredIntervals = append(filteredIntervals, downInterval) } else { - logger.Infof("found overlap with a node update: %s", overlapIntervals[0]) + logger.Infof("filtering out metrics endpoint down interval due to overlap with node update/reboot: %s", downInterval) } } + logger.Infof("returning %d filtered metrics endpoint down intervals (filtered out %d that overlapped with node updates/reboots)", + len(filteredIntervals), len(metricsEndpointDownIntervals)-len(filteredIntervals)) + + return filteredIntervals, nil +} + +func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) { + logger := logrus.WithField("MonitorTest", "MetricsEndpointDown") + + // Get metrics endpoint down intervals - these have already been filtered in ConstructComputedIntervals + // to exclude overlaps with node updates/reboots + metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool { + return eventInterval.Source == monitorapi.SourceMetricsEndpointDown + }) + logger.Infof("evaluating %d metrics endpoint down intervals (already filtered)", len(metricsEndpointDownIntervals)) + junits := []*junitapi.JUnitTestCase{} - if len(failures) > 0 { + if len(metricsEndpointDownIntervals) > 0 { + failures := []string{} + for _, downInterval := range metricsEndpointDownIntervals { + failures = append(failures, downInterval.String()) + } testOutput := fmt.Sprintf("found prometheus reporting metrics endpoints down outside of a node update: \n %s", strings.Join(failures, "\n ")) - // This metrics down interval did not overlap with any update for the corresponding node, fail/flake a junit: - // Limit to kubelet service, all we're querying right now? junits = append(junits, &junitapi.JUnitTestCase{ Name: testName, FailureOutput: &junitapi.FailureOutput{ @@ -85,14 +101,15 @@ func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Co }, }) } - // Add a success so this is marked as a flake at worst, no idea what this will unleash in the wild. + // Add a success so this is marked as a flake at worst junits = append(junits, &junitapi.JUnitTestCase{ Name: testName, }) return junits, nil } -func (*metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error { +func (w *metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error { + // No longer writing autodl files here - intervaldurationsum monitor test handles this return nil }