From 4fba148652fa292cc991f6c4c6e8bec170d30ab5 Mon Sep 17 00:00:00 2001
From: Mark
Date: Thu, 2 May 2024 13:44:28 -0400
Subject: [PATCH] feat(gcp+aws): Add last_scrape_time metric (#159)

In order to better track the freshness of our data, this PR adds a few
more operational metrics:

- `cloudcost_exporter_collector_last_scrape_time`
- `cloudcost_exporter_last_scrape_time`

Both export the time of the last scrape as a Unix timestamp. This can be
used to alert in Prometheus when the last scrape is older than some
threshold, say 60 minutes (for example,
`time() - cloudcost_exporter_last_scrape_time > 3600`).

This PR also implements in AWS the operational metrics that GCP already
exports, so that the two providers have feature parity. In the future it
would make sense to generalize these metrics behind a common interface so
that new providers do not need to reimplement them.

- refs #5 + #105
---
 pkg/aws/aws.go         | 74 ++++++++++++++++++++++++++++++++++++++++--
 pkg/aws/aws_test.go    |  1 +
 pkg/google/gcp.go      | 16 +++++++++
 pkg/google/gcp_test.go | 32 ++++++++++++++----
 4 files changed, 115 insertions(+), 8 deletions(-)
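Reviewer note: the "common interface" follow-up mentioned above could look
roughly like the sketch below. This is an illustration only, not part of
this patch; the package name `opmetrics` and every identifier in it are
hypothetical.

    // Package opmetrics is a hypothetical shared home for the operational
    // metrics that pkg/aws and pkg/google currently declare separately.
    package opmetrics

    import (
    	"time"

    	"github.com/prometheus/client_golang/prometheus"
    )

    // ProviderMetrics bundles the provider-level operational descriptors.
    type ProviderMetrics struct {
    	LastScrapeError    *prometheus.Desc
    	LastScrapeDuration *prometheus.Desc
    	LastScrapeTime     *prometheus.Desc
    	ScrapesTotal       *prometheus.CounterVec
    }

    // NewProviderMetrics builds the shared descriptors once, so a new
    // provider no longer has to re-declare them.
    func NewProviderMetrics(exporterName string) *ProviderMetrics {
    	return &ProviderMetrics{
    		LastScrapeError: prometheus.NewDesc(
    			prometheus.BuildFQName(exporterName, "", "last_scrape_error"),
    			"Was the last scrape an error. 1 indicates an error.",
    			[]string{"provider"}, nil,
    		),
    		LastScrapeDuration: prometheus.NewDesc(
    			prometheus.BuildFQName(exporterName, "", "last_scrape_duration_seconds"),
    			"Duration of the last scrape in seconds.",
    			[]string{"provider"}, nil,
    		),
    		LastScrapeTime: prometheus.NewDesc(
    			prometheus.BuildFQName(exporterName, "", "last_scrape_time"),
    			"Time of the last scrape.",
    			[]string{"provider"}, nil,
    		),
    		ScrapesTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
    			Name: prometheus.BuildFQName(exporterName, "", "scrapes_total"),
    			Help: "Total number of scrapes.",
    		}, []string{"provider"}),
    	}
    }

    // EmitScrape sends the per-scrape provider metrics; a provider's Collect
    // would call this once at the end, passing the scrape start time.
    func (m *ProviderMetrics) EmitScrape(ch chan<- prometheus.Metric, provider string, start time.Time, scrapeError float64) {
    	ch <- prometheus.MustNewConstMetric(m.LastScrapeError, prometheus.GaugeValue, scrapeError, provider)
    	ch <- prometheus.MustNewConstMetric(m.LastScrapeDuration, prometheus.GaugeValue, time.Since(start).Seconds(), provider)
    	ch <- prometheus.MustNewConstMetric(m.LastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), provider)
    	m.ScrapesTotal.WithLabelValues(provider).Inc()
    }

With something like that in place, the four provider-level sends at the
bottom of each Collect below would collapse to a single EmitScrape call,
and the collector-level descriptors could be shared the same way.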
diff --git a/pkg/aws/aws.go b/pkg/aws/aws.go
index 7e659708..aa9fb00b 100644
--- a/pkg/aws/aws.go
+++ b/pkg/aws/aws.go
@@ -29,12 +29,62 @@ type AWS struct {
 }
 
 var (
+	providerLastScrapeErrorDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_error"),
+		"Was the last scrape an error. 1 indicates an error.",
+		[]string{"provider"},
+		nil,
+	)
 	collectorSuccessDesc = prometheus.NewDesc(
 		prometheus.BuildFQName(cloudcost_exporter.ExporterName, subsystem, "collector_success"),
 		"Was the last scrape of the AWS metrics successful.",
 		[]string{"collector"},
 		nil,
 	)
+	collectorLastScrapeErrorDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_error"),
+		"Was the last scrape an error. 1 indicates an error.",
+		[]string{"provider", "collector"},
+		nil,
+	)
+	collectorDurationDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_duration_seconds"),
+		"Duration of the last scrape in seconds.",
+		[]string{"provider", "collector"},
+		nil,
+	)
+	collectorScrapesTotalCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "scrapes_total"),
+			Help: "Total number of scrapes for a collector.",
+		},
+		[]string{"provider", "collector"},
+	)
+	collectorLastScrapeTime = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_time"),
+		"Time of the last scrape.",
+		[]string{"provider", "collector"},
+		nil,
+	)
+	providerLastScrapeTime = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_time"),
+		"Time of the last scrape.",
+		[]string{"provider"},
+		nil,
+	)
+	providerLastScrapeDurationDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_duration_seconds"),
+		"Duration of the last scrape in seconds.",
+		[]string{"provider"},
+		nil,
+	)
+	providerScrapesTotalCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "scrapes_total"),
+			Help: "Total number of scrapes.",
+		},
+		[]string{"provider"},
+	)
 )
 
 var services = []string{"S3"}
@@ -85,6 +135,9 @@ func New(config *Config) (*AWS, error) {
 
 func (a *AWS) RegisterCollectors(registry provider.Registry) error {
 	log.Printf("Registering %d collectors for AWS", len(a.collectors))
+	registry.MustRegister(
+		collectorScrapesTotalCounter,
+	)
 	for _, c := range a.collectors {
 		if err := c.Register(registry); err != nil {
 			return err
@@ -94,6 +147,13 @@ func (a *AWS) RegisterCollectors(registry provider.Registry) error {
 }
 
 func (a *AWS) Describe(ch chan<- *prometheus.Desc) {
+	ch <- collectorLastScrapeErrorDesc
+	ch <- collectorDurationDesc
+	ch <- providerLastScrapeErrorDesc
+	ch <- providerLastScrapeDurationDesc
+	ch <- collectorLastScrapeTime
+	ch <- providerLastScrapeTime
+	ch <- collectorSuccessDesc
 	for _, c := range a.collectors {
 		if err := c.Describe(ch); err != nil {
 			log.Printf("Error describing collector %s: %s", c.Name(), err)
@@ -102,18 +162,28 @@ func (a *AWS) Describe(ch chan<- *prometheus.Desc) {
 }
 
 func (a *AWS) Collect(ch chan<- prometheus.Metric) {
+	start := time.Now()
 	wg := &sync.WaitGroup{}
 	wg.Add(len(a.collectors))
 	for _, c := range a.collectors {
 		go func(c provider.Collector) {
+			now := time.Now()
 			defer wg.Done()
-			collectorSuccess := 1.0
+			collectorErrors := 0.0
 			if err := c.Collect(ch); err != nil {
-				collectorSuccess = 0.0
+				collectorErrors = 1.0
 				log.Printf("Error collecting metrics from collector %s: %s", c.Name(), err)
 			}
+			ch <- prometheus.MustNewConstMetric(collectorLastScrapeErrorDesc, prometheus.GaugeValue, collectorErrors, subsystem, c.Name())
+			ch <- prometheus.MustNewConstMetric(collectorDurationDesc, prometheus.GaugeValue, time.Since(now).Seconds(), subsystem, c.Name())
+			ch <- prometheus.MustNewConstMetric(collectorLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem, c.Name())
-			ch <- prometheus.MustNewConstMetric(collectorSuccessDesc, prometheus.GaugeValue, collectorSuccess, c.Name())
+			ch <- prometheus.MustNewConstMetric(collectorSuccessDesc, prometheus.GaugeValue, 1.0-collectorErrors, c.Name())
+			collectorScrapesTotalCounter.WithLabelValues(subsystem, c.Name()).Inc()
 		}(c)
 	}
 	wg.Wait()
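+	// Provider-level scrape metrics are emitted once per scrape, after all collector goroutines have finished.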
+	ch <- prometheus.MustNewConstMetric(providerLastScrapeErrorDesc, prometheus.GaugeValue, 0.0, subsystem)
+	ch <- prometheus.MustNewConstMetric(providerLastScrapeDurationDesc, prometheus.GaugeValue, time.Since(start).Seconds(), subsystem)
+	ch <- prometheus.MustNewConstMetric(providerLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem)
+	providerScrapesTotalCounter.WithLabelValues(subsystem).Inc()
 }
diff --git a/pkg/aws/aws_test.go b/pkg/aws/aws_test.go
index 23c95b01..91cea2e0 100644
--- a/pkg/aws/aws_test.go
+++ b/pkg/aws/aws_test.go
@@ -63,6 +63,7 @@ func Test_RegisterCollectors(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			ctrl := gomock.NewController(t)
 			r := mock_provider.NewMockRegistry(ctrl)
+			r.EXPECT().MustRegister(gomock.Any()).AnyTimes()
 			c := mock_provider.NewMockCollector(ctrl)
 			if tc.register != nil {
 				c.EXPECT().Register(r).DoAndReturn(tc.register).Times(tc.numCollectors)
diff --git a/pkg/google/gcp.go b/pkg/google/gcp.go
index 9844fd84..44c0435c 100644
--- a/pkg/google/gcp.go
+++ b/pkg/google/gcp.go
@@ -64,6 +64,18 @@ var (
 		},
 		[]string{"provider", "collector"},
 	)
+	collectorLastScrapeTime = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_time"),
+		"Time of the last scrape.",
+		[]string{"provider", "collector"},
+		nil,
+	)
+	providerLastScrapeTime = prometheus.NewDesc(
+		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_time"),
+		"Time of the last scrape.",
+		[]string{"provider"},
+		nil,
+	)
 )
 
 type GCP struct {
@@ -163,6 +175,8 @@ func (g *GCP) Describe(ch chan<- *prometheus.Desc) {
 	ch <- collectorDurationDesc
 	ch <- providerLastScrapeErrorDesc
 	ch <- providerLastScrapeDurationDesc
+	ch <- collectorLastScrapeTime
+	ch <- providerLastScrapeTime
 	for _, c := range g.collectors {
 		if err := c.Describe(ch); err != nil {
 			log.Printf("Error describing collector %s: %s", c.Name(), err)
@@ -187,6 +201,7 @@ func (g *GCP) Collect(ch chan<- prometheus.Metric) {
 			log.Printf("Collector(%s) collect respose=%.2f", c.Name(), collectorSuccess)
 			ch <- prometheus.MustNewConstMetric(collectorLastScrapeErrorDesc, prometheus.GaugeValue, collectorSuccess, subsystem, c.Name())
 			ch <- prometheus.MustNewConstMetric(collectorDurationDesc, prometheus.GaugeValue, time.Since(now).Seconds(), subsystem, c.Name())
+			ch <- prometheus.MustNewConstMetric(collectorLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem, c.Name())
 			collectorScrapesTotalCounter.WithLabelValues(subsystem, c.Name()).Inc()
 		}(c)
 	}
@@ -194,5 +209,6 @@ func (g *GCP) Collect(ch chan<- prometheus.Metric) {
 	// When can the error actually happen? Potentially if all the collectors fail?
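+	// The provider-level error metric is hard-coded to 0 for now; per-collector failures surface via collector_last_scrape_error.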
 	ch <- prometheus.MustNewConstMetric(providerLastScrapeErrorDesc, prometheus.GaugeValue, 0.0, subsystem)
 	ch <- prometheus.MustNewConstMetric(providerLastScrapeDurationDesc, prometheus.GaugeValue, time.Since(start).Seconds(), subsystem)
+	ch <- prometheus.MustNewConstMetric(providerLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem)
 	providerScrapesTotalCounter.WithLabelValues(subsystem).Inc()
 }
diff --git a/pkg/google/gcp_test.go b/pkg/google/gcp_test.go
index a9195981..0d6213d0 100644
--- a/pkg/google/gcp_test.go
+++ b/pkg/google/gcp_test.go
@@ -88,6 +88,12 @@ func TestGCP_CollectMetrics(t *testing.T) {
 					Value:      0,
 					MetricType: prometheus.GaugeValue,
 				},
+				{
+					FqName:     "cloudcost_exporter_collector_last_scrape_time",
+					Labels:     utils.LabelMap{"provider": "gcp", "collector": "test"},
+					Value:      0,
+					MetricType: prometheus.GaugeValue,
+				},
 				{
 					FqName:     "cloudcost_exporter_last_scrape_error",
 					Labels:     utils.LabelMap{"provider": "gcp"},
@@ -117,6 +123,12 @@ func TestGCP_CollectMetrics(t *testing.T) {
 					Value:      0,
 					MetricType: prometheus.GaugeValue,
 				},
 				{
+					FqName:     "cloudcost_exporter_collector_last_scrape_time",
+					Labels:     utils.LabelMap{"provider": "gcp", "collector": "test"},
+					Value:      0,
+					MetricType: prometheus.GaugeValue,
+				},
+				{
 					FqName:     "cloudcost_exporter_collector_last_scrape_error",
 					Labels:     utils.LabelMap{"provider": "gcp", "collector": "test"},
 					Value:      0,
 					MetricType: prometheus.GaugeValue,
@@ -128,14 +140,16 @@ func TestGCP_CollectMetrics(t *testing.T) {
 					Value:      0,
 					MetricType: prometheus.GaugeValue,
 				},
+				{
-					FqName:     "cloudcost_exporter_last_scrape_error",
-					Labels:     utils.LabelMap{"provider": "gcp"},
+					FqName:     "cloudcost_exporter_collector_last_scrape_time",
+					Labels:     utils.LabelMap{"provider": "gcp", "collector": "test"},
 					Value:      0,
 					MetricType: prometheus.GaugeValue,
 				},
+				{
-					FqName:     "cloudcost_exporter_last_scrape_duration_seconds",
+					FqName:     "cloudcost_exporter_last_scrape_error",
 					Labels:     utils.LabelMap{"provider": "gcp"},
 					Value:      0,
 					MetricType: prometheus.GaugeValue,
@@ -176,13 +190,19 @@ func TestGCP_CollectMetrics(t *testing.T) {
 			wg.Done()
 			wg.Wait()
 
+			ignoredMetricSuffix := []string{"duration_seconds", "last_scrape_time"}
+			// I don't love using a labeled loop, but it lets the inner loop continue the outer one once a suffix has matched.
+		metricsLoop:
 			for _, expectedMetric := range tt.expectedMetrics {
 				metric := utils.ReadMetrics(<-ch)
 				// We don't care about the value for the scrape durations, just that it exists and is returned in the order we expect.
-				if strings.Contains(metric.FqName, "duration_seconds") {
-					require.Equal(t, expectedMetric.FqName, metric.FqName)
-					continue
+				for _, suffix := range ignoredMetricSuffix {
+					if strings.Contains(metric.FqName, suffix) {
+						require.Equal(t, expectedMetric.FqName, metric.FqName)
+						continue metricsLoop
+					}
 				}
+
 				require.Equal(t, expectedMetric, metric)
 			}