2 changes: 2 additions & 0 deletions pkg/defaultmonitortests/types.go
@@ -54,6 +54,7 @@ import (
"github.com/openshift/origin/pkg/monitortests/testframework/highcpumetriccollector"
"github.com/openshift/origin/pkg/monitortests/testframework/highcputestanalyzer"

"github.com/openshift/origin/pkg/monitortests/testframework/intervaldurationsum"
"github.com/openshift/origin/pkg/monitortests/testframework/intervalserializer"
"github.com/openshift/origin/pkg/monitortests/testframework/knownimagechecker"
"github.com/openshift/origin/pkg/monitortests/testframework/legacytestframeworkmonitortests"
@@ -132,6 +133,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI

monitorTestRegistry.AddMonitorTestOrDie("alert-summary-serializer", "Test Framework", alertanalyzer.NewAlertSummarySerializer())
monitorTestRegistry.AddMonitorTestOrDie("metrics-endpoints-down", "Test Framework", metricsendpointdown.NewMetricsEndpointDown())
monitorTestRegistry.AddMonitorTestOrDie("interval-duration-sum", "Test Framework", intervaldurationsum.NewIntervalDurationSum())
monitorTestRegistry.AddMonitorTestOrDie("external-service-availability", "Test Framework", disruptionexternalservicemonitoring.NewAvailabilityInvariant())
monitorTestRegistry.AddMonitorTestOrDie("external-gcp-cloud-service-availability", "Test Framework", disruptionexternalgcpcloudservicemonitoring.NewCloudAvailabilityInvariant())
monitorTestRegistry.AddMonitorTestOrDie("external-aws-cloud-service-availability", "Test Framework", disruptionexternalawscloudservicemonitoring.NewCloudAvailabilityInvariant())
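The types.go change above imports the new package and registers the interval-duration-sum monitor test alongside the existing ones. A minimal sketch of the monitor-test interface those registrations plug into, inferred solely from the method signatures in the new file below; the authoritative definition lives in pkg/monitortestframework and may differ:

package sketch // illustrative only, not part of this PR

import (
    "context"
    "time"

    "github.com/openshift/origin/pkg/monitor/monitorapi"
    "github.com/openshift/origin/pkg/test/ginkgo/junitapi"
    "k8s.io/client-go/rest"
)

// MonitorTest, as inferred from the methods intervalDurationSum implements below.
type MonitorTest interface {
    PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error
    StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error
    CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error)
    ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error)
    EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error)
    WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error
    Cleanup(ctx context.Context) error
}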
111 changes: 111 additions & 0 deletions pkg/monitortests/testframework/intervaldurationsum/monitortest.go
@@ -0,0 +1,111 @@
package intervaldurationsum

import (
"context"
"fmt"
"path/filepath"
"time"

"github.com/openshift/origin/pkg/dataloader"
"github.com/openshift/origin/pkg/monitortestframework"
"github.com/sirupsen/logrus"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
"k8s.io/client-go/rest"
)

// intervalDurationSum is a monitor test that sums the total duration of intervals
// matching specific sources and writes the results to an autodl file.
//
// The generated autodl file will have the following schema:
// - IntervalSource (string): The source type of the intervals
// - TotalDurationSeconds (float64): Sum of all interval durations in seconds for that source
//
// The autodl file will be named: interval-duration-sum{timeSuffix}-autodl.json
type intervalDurationSum struct {
adminRESTConfig *rest.Config
}

// NewIntervalDurationSum creates a monitor test that sums the total duration of intervals
// for specific sources and writes the results to an autodl file.
func NewIntervalDurationSum() monitortestframework.MonitorTest {
return &intervalDurationSum{}
}

func (w *intervalDurationSum) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
return nil
}

func (w *intervalDurationSum) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
w.adminRESTConfig = adminRESTConfig
return nil
}

func (w *intervalDurationSum) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
return nil, nil, nil
}

func (w *intervalDurationSum) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
return nil, nil
}

func (w *intervalDurationSum) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
return nil, nil
}

func (w *intervalDurationSum) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
logger := logrus.WithField("MonitorTest", "IntervalDurationSum")

// Define the interval sources to track
sourcesToTrack := []monitorapi.IntervalSource{
monitorapi.SourceMetricsEndpointDown,
monitorapi.SourceCPUMonitor,
}

// Calculate total duration for each source
rows := []map[string]string{}
for _, source := range sourcesToTrack {
matchingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Source == source
})

var totalDurationSeconds float64
for _, interval := range matchingIntervals {
duration := interval.To.Sub(interval.From).Seconds()
totalDurationSeconds += duration
}

logger.Infof("Total duration for source %s: %.2f seconds across %d intervals", source, totalDurationSeconds, len(matchingIntervals))

rows = append(rows, map[string]string{
"IntervalSource": string(source),
"TotalDurationSeconds": fmt.Sprintf("%.2f", totalDurationSeconds),
})
}

// Create autodl artifact with total durations per source
dataFile := dataloader.DataFile{
TableName: "interval_duration_sum",
Schema: map[string]dataloader.DataType{
"IntervalSource": dataloader.DataTypeString,
"TotalDurationSeconds": dataloader.DataTypeFloat64,
},
Rows: rows,
}

// Create the file name using the autodl suffix
fileName := filepath.Join(storageDir, fmt.Sprintf("interval-duration-sum%s-%s", timeSuffix, dataloader.AutoDataLoaderSuffix))

// Write the data file
err := dataloader.WriteDataFile(fileName, dataFile)
if err != nil {
logger.WithError(err).Warnf("unable to write data file: %s", fileName)
}

return nil
}

func (w *intervalDurationSum) Cleanup(ctx context.Context) error {
return nil
}
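The doc comment at the top of this file describes the emitted schema; to make the duration arithmetic in WriteContentToStorage concrete, here is a small self-contained sketch (not part of the PR, with hypothetical from/to values) that sums durations the same way over plain time ranges rather than monitorapi.Interval values:

package main

import (
    "fmt"
    "time"
)

func main() {
    base := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)
    // Hypothetical from/to pairs standing in for the intervals of a single source.
    ranges := [][2]time.Time{
        {base, base.Add(30 * time.Second)},
        {base.Add(2 * time.Minute), base.Add(2*time.Minute + 45*time.Second)},
    }

    var totalDurationSeconds float64
    for _, r := range ranges {
        totalDurationSeconds += r[1].Sub(r[0]).Seconds()
    }
    // Prints 75.00, the value that would land in the TotalDurationSeconds column.
    fmt.Printf("%.2f\n", totalDurationSeconds)
}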
67 changes: 42 additions & 25 deletions pkg/monitortests/testframework/metricsendpointdown/monitortest.go
@@ -35,64 +35,81 @@ func (w *metricsEndpointDown) StartCollection(ctx context.Context, adminRESTConf
}

func (w *metricsEndpointDown) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
intervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
return intervals, nil, err
// Don't return intervals here - we'll filter them in ConstructComputedIntervals
return nil, nil, nil
}

func (*metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
return nil, nil
}

func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
failures := []string{}
func (w *metricsEndpointDown) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")
metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
})
logger.Infof("found %d metrics endpoint down intervals", len(metricsEndpointDownIntervals))

// We know these endpoints go down both during node update, and obviously during reboot, ignore overlap
// with either:
nodeUpdateIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
// Query Prometheus for metrics endpoint down intervals
metricsEndpointDownIntervals, err := buildIntervalsForMetricsEndpointsDown(ctx, w.adminRESTConfig, beginning)
if err != nil {
return nil, err
}
logger.Infof("found %d metrics endpoint down intervals from Prometheus", len(metricsEndpointDownIntervals))

// Filter for node update and reboot intervals
nodeUpdateIntervals := startingIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
return (eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Update") ||
(eventInterval.Source == monitorapi.SourceNodeState && eventInterval.Message.Annotations["phase"] == "Reboot")
})
logger.Infof("found %d node update intervals", len(nodeUpdateIntervals))
logger.Infof("found %d node update/reboot intervals", len(nodeUpdateIntervals))

// Filter out metrics endpoint down intervals that overlap with node updates/reboots
filteredIntervals := monitorapi.Intervals{}
for _, downInterval := range metricsEndpointDownIntervals {
logger.Infof("checking metrics down interval: %s", downInterval)
restartsForNodeIntervals := nodeUpdateIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Locator.Keys[monitorapi.LocatorNodeKey] == downInterval.Locator.Keys[monitorapi.LocatorNodeKey]
})
overlapIntervals := utility.FindOverlap(restartsForNodeIntervals, downInterval)
if len(overlapIntervals) == 0 {
failures = append(failures, downInterval.String())
logger.Info("found no overlap with a node update")
// No overlap with node update/reboot - keep this interval
filteredIntervals = append(filteredIntervals, downInterval)
} else {
logger.Infof("found overlap with a node update: %s", overlapIntervals[0])
logger.Infof("filtering out metrics endpoint down interval due to overlap with node update/reboot: %s", downInterval)
}
}
logger.Infof("returning %d filtered metrics endpoint down intervals (filtered out %d that overlapped with node updates/reboots)",
len(filteredIntervals), len(metricsEndpointDownIntervals)-len(filteredIntervals))

return filteredIntervals, nil
}

func (*metricsEndpointDown) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
logger := logrus.WithField("MonitorTest", "MetricsEndpointDown")

// Get metrics endpoint down intervals - these have already been filtered in ConstructComputedIntervals
// to exclude overlaps with node updates/reboots
metricsEndpointDownIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Source == monitorapi.SourceMetricsEndpointDown
})
logger.Infof("evaluating %d metrics endpoint down intervals (already filtered)", len(metricsEndpointDownIntervals))

junits := []*junitapi.JUnitTestCase{}
if len(failures) > 0 {
if len(metricsEndpointDownIntervals) > 0 {
failures := []string{}
for _, downInterval := range metricsEndpointDownIntervals {
failures = append(failures, downInterval.String())
}
testOutput := fmt.Sprintf("found prometheus reporting metrics endpoints down outside of a node update: \n %s",
strings.Join(failures, "\n "))
// This metrics down interval did not overlap with any update for the corresponding node, fail/flake a junit:
// Limit to kubelet service, all we're querying right now?
junits = append(junits, &junitapi.JUnitTestCase{
Name: testName,
FailureOutput: &junitapi.FailureOutput{
Output: testOutput,
},
})
}
// Add a success so this is marked as a flake at worst, no idea what this will unleash in the wild.
// Add a success so this is marked as a flake at worst
junits = append(junits, &junitapi.JUnitTestCase{
Name: testName,
})
return junits, nil
}

func (*metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
func (w *metricsEndpointDown) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
// No longer writing autodl files here - intervaldurationsum monitor test handles this
return nil
}
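The change above moves the node-update/reboot overlap filtering from EvaluateTestsFromConstructedIntervals into ConstructComputedIntervals, so downstream consumers only ever see metrics-endpoint-down intervals that fall outside those windows. A rough, simplified stand-in for the overlap decision follows (the real per-node check uses utility.FindOverlap, whose implementation is not shown in this diff):

package main

import (
    "fmt"
    "time"
)

// overlaps reports whether the ranges [aFrom, aTo) and [bFrom, bTo) share any time.
func overlaps(aFrom, aTo, bFrom, bTo time.Time) bool {
    return aFrom.Before(bTo) && bFrom.Before(aTo)
}

func main() {
    base := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)
    // Hypothetical node reboot window and metrics-down window on the same node.
    rebootFrom, rebootTo := base, base.Add(5*time.Minute)
    downFrom, downTo := base.Add(2*time.Minute), base.Add(3*time.Minute)

    if overlaps(downFrom, downTo, rebootFrom, rebootTo) {
        fmt.Println("filtered out: down interval overlaps a node update/reboot")
    } else {
        fmt.Println("kept: down interval will surface in the junit as a failure/flake")
    }
}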
