From 6de356728a6ced2342edee05531b5dfe629798b6 Mon Sep 17 00:00:00 2001 From: Jonathan Bowe Date: Thu, 29 May 2025 07:53:56 -0400 Subject: [PATCH 1/3] Feat: Improve Error Handling for Server.Scrape (#1158) * Update querySettings Error Return in Scrape If there are errors querying namespace mappings, the potential error from querySettings is obscured. Adding an immediate return if there are errors retrieving settings. Signed-off-by: Jonathan Bowe * Improve Verbosity of queryNamespaceMappings Errors Previously if any errors were encountered by queryNamespaceMappings, only a count of those errors was returned - making debugging those errors harder than it needs to be. I'm changing this to immediately return nil if no errors are encountered, and otherwise an error will be formatted with each of the namespaces and what the error was for that namespace. Signed-off-by: Jonathan Bowe * Simplify Error Message Co-authored-by: Ben Kochie Signed-off-by: Jonathan Bowe --------- Signed-off-by: Jonathan Bowe Signed-off-by: Jonathan Bowe Co-authored-by: Ben Kochie Signed-off-by: Peter Nuttall --- cmd/postgres_exporter/server.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cmd/postgres_exporter/server.go b/cmd/postgres_exporter/server.go index bd4e76e10..3d2ecde91 100644 --- a/cmd/postgres_exporter/server.go +++ b/cmd/postgres_exporter/server.go @@ -119,12 +119,17 @@ func (s *Server) Scrape(ch chan<- prometheus.Metric, disableSettingsMetrics bool if !disableSettingsMetrics && s.master { if err = querySettings(ch, s); err != nil { err = fmt.Errorf("error retrieving settings: %s", err) + return err } } errMap := queryNamespaceMappings(ch, s) - if len(errMap) > 0 { - err = fmt.Errorf("queryNamespaceMappings returned %d errors", len(errMap)) + if len(errMap) == 0 { + return nil + } + err = fmt.Errorf("queryNamespaceMappings errors encountered") + for namespace, errStr := range errMap { + err = fmt.Errorf("%s, namespace: %s error: %s", err, namespace, 
errStr) } return err From 7b9d0b0f5f44d3b84e9e3299fdf01ae46378917b Mon Sep 17 00:00:00 2001 From: Peter Nuttall Date: Wed, 18 Jun 2025 10:03:11 +0000 Subject: [PATCH 2/3] Add a Collector for `pg_stat_io`. Docs: https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-IO-VIEW We use this in production. One problem is that the stats for a vacuum are reported when the vacuum ends, not continuously. Signed-off-by: Peter Nuttall --- collector/pg_stat_io.go | 325 +++++++++++++++++++++++++++++++++++ collector/pg_stat_io_test.go | 156 +++++++++++++++++ 2 files changed, 481 insertions(+) create mode 100644 collector/pg_stat_io.go create mode 100644 collector/pg_stat_io_test.go diff --git a/collector/pg_stat_io.go b/collector/pg_stat_io.go new file mode 100644 index 000000000..fd7cdb428 --- /dev/null +++ b/collector/pg_stat_io.go @@ -0,0 +1,325 @@ +// Copyright 2023 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package collector + +import ( + "context" + "database/sql" + "log/slog" + + "github.com/blang/semver/v4" + "github.com/prometheus/client_golang/prometheus" +) + +const statIOSubsystem = "stat_io" + +func init() { + registerCollector(statIOSubsystem, defaultDisabled, NewStatIOCollector) +} + +type StatIOCollector struct { + log *slog.Logger +} + +func NewStatIOCollector(config collectorConfig) (Collector, error) { + return &StatIOCollector{ + log: config.logger, + }, nil +} + +var ( + statIOReadsTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "reads_total"), + "Number of read operations, each of the size specified in op_bytes.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + statIOReadTimeTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "read_time_total"), + "Time spent in read operations in milliseconds (if track_io_timing is enabled, otherwise zero)", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + + statIOWritesTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "writes_total"), + "Number of write operations, each of the size specified in op_bytes.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + statIOWriteTimeTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "writes_time_total"), + "Time spent in write operations in milliseconds (if track_io_timing is enabled, otherwise zero)", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + + statIOWriteBackTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "write_back_total"), + "Number of units of size op_bytes which the process requested the kernel write out to permanent storage.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + statIOWriteBackTimeTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, 
statIOSubsystem, "write_back_time_total"), + "Time spent in writeback operations in milliseconds (if track_io_timing is enabled, otherwise zero). This includes the time spent queueing write-out requests and, potentially, the time spent to write out the dirty data.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + + statIOExtendsTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "extends_total"), + "Number of relation extend operations, each of the size specified in op_bytes.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + + statIOExtendsTimeTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "extends_time_total"), + "Time spent in extend operations in milliseconds (if track_io_timing is enabled, otherwise zero)", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + + statIOHitsTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "hits_total"), + "The number of times a desired block was found in a shared buffer.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + statIOEvictionsTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "evictions_total"), + "Number of times a block has been written out from a shared or local buffer in order to make it available for another use.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + statIOReusesTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "reuses_total"), + "The number of times an existing buffer in a size-limited ring buffer outside of shared buffers was reused as part of an I/O operation in the bulkread, bulkwrite, or vacuum contexts.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + + statIOFsyncsTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "fsync_total"), + "Number of 
fsync calls. These are only tracked in context normal.", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + statIOFsyncTimeTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, statIOSubsystem, "fsync_time_total"), + "Time spent in fsync operations in milliseconds (if track_io_timing is enabled, otherwise zero)", + []string{"backend_type", "object", "context"}, + prometheus.Labels{}, + ) + statIOQuery = ` + SELECT + backend_type, + object, + context, + reads, + read_time, + writes, + write_time, + writebacks, + writeback_time, + extends, + extend_time, + hits, + evictions, + reuses, + fsyncs, + fsync_time + + FROM + pg_stat_io + ` +) + +// Update implements Collector and exposes I/O statistics from the pg_stat_io view. +// It is called by the Prometheus registry when collecting metrics. +func (c StatIOCollector) Update(ctx context.Context, instance *instance, ch chan<- prometheus.Metric) error { + // pg_stat_io only exists in PostgreSQL 16 and later; older servers are skipped. 
+ if !instance.version.GE(semver.MustParse("16.0.0")) { + return nil + } + db := instance.getDB() + // Query the pg_stat_io view + rows, err := db.QueryContext(ctx, statIOQuery) + if err != nil { + return err + } + defer rows.Close() + + var backendType, object, PGContext sql.NullString + var reads, writes, writeBacks, extends, hits, evictions, reuses, fsyncs sql.NullInt64 + var readTime, writeTime, writeBackTime, extendsTime, fsyncTime sql.NullFloat64 + + for rows.Next() { + if err := rows.Scan( + &backendType, &object, &PGContext, + &reads, + &readTime, + &writes, + &writeTime, + &writeBacks, + &writeBackTime, + &extends, + &extendsTime, + &hits, + &evictions, + &reuses, + &fsyncs, + &fsyncTime); err != nil { + return err + } + + if !backendType.Valid || !object.Valid || !PGContext.Valid { + continue + } + + readsMetric := 0.0 + if reads.Valid { + readsMetric = float64(reads.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOReadsTotal, + prometheus.CounterValue, + readsMetric, + backendType.String, object.String, PGContext.String) + + readTimeMetric := 0.0 + if readTime.Valid { + readTimeMetric = readTime.Float64 + } + ch <- prometheus.MustNewConstMetric( + statIOReadTimeTotal, + prometheus.CounterValue, + readTimeMetric, + backendType.String, object.String, PGContext.String) + + writesMetric := 0.0 + if writes.Valid { + writesMetric = float64(writes.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOWritesTotal, + prometheus.CounterValue, + writesMetric, + backendType.String, object.String, PGContext.String) + + writeTimeMetric := 0.0 + if writeTime.Valid { + writeTimeMetric = writeTime.Float64 + } + ch <- prometheus.MustNewConstMetric( + statIOWriteTimeTotal, + prometheus.CounterValue, + writeTimeMetric, + backendType.String, object.String, PGContext.String) + + writeBackMetric := 0.0 + if writeBacks.Valid { + writeBackMetric = float64(writeBacks.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOWriteBackTotal, 
prometheus.CounterValue, + writeBackMetric, + backendType.String, object.String, PGContext.String) + + writeBackTimeMetric := 0.0 + if writeBackTime.Valid { + writeBackTimeMetric = writeBackTime.Float64 + } + ch <- prometheus.MustNewConstMetric( + statIOWriteBackTimeTotal, + prometheus.CounterValue, + writeBackTimeMetric, + backendType.String, object.String, PGContext.String) + + extendsMetric := 0.0 + if extends.Valid { + extendsMetric = float64(extends.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOExtendsTotal, + prometheus.CounterValue, + extendsMetric, + backendType.String, object.String, PGContext.String) + + extendsTimeMetric := 0.0 + if extendsTime.Valid { + extendsTimeMetric = extendsTime.Float64 + } + ch <- prometheus.MustNewConstMetric( + statIOExtendsTimeTotal, + prometheus.CounterValue, + extendsTimeMetric, + backendType.String, object.String, PGContext.String) + + hitsMetric := 0.0 + if hits.Valid { + hitsMetric = float64(hits.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOHitsTotal, + prometheus.CounterValue, + hitsMetric, + backendType.String, object.String, PGContext.String) + + evictionsMetric := 0.0 + if evictions.Valid { + evictionsMetric = float64(evictions.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOEvictionsTotal, + prometheus.CounterValue, + evictionsMetric, + backendType.String, object.String, PGContext.String) + + reusesMetric := 0.0 + if reuses.Valid { + reusesMetric = float64(reuses.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOReusesTotal, + prometheus.CounterValue, + reusesMetric, + backendType.String, object.String, PGContext.String) + + fsyncsMetric := 0.0 + if fsyncs.Valid { + fsyncsMetric = float64(fsyncs.Int64) + } + ch <- prometheus.MustNewConstMetric( + statIOFsyncsTotal, + prometheus.CounterValue, + fsyncsMetric, + backendType.String, object.String, PGContext.String) + + fsyncTimeMetric := 0.0 + if fsyncTime.Valid { + fsyncTimeMetric = fsyncTime.Float64 + } + ch <- 
prometheus.MustNewConstMetric( + statIOFsyncTimeTotal, + prometheus.CounterValue, + fsyncTimeMetric, + backendType.String, object.String, PGContext.String) + + } + + return rows.Err() +} diff --git a/collector/pg_stat_io_test.go b/collector/pg_stat_io_test.go new file mode 100644 index 000000000..48e1f84a7 --- /dev/null +++ b/collector/pg_stat_io_test.go @@ -0,0 +1,156 @@ +// Copyright 2023 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package collector + +import ( + "context" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/blang/semver/v4" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/smartystreets/goconvey/convey" +) + +func TestPGStatIOCollector(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("Error opening a stub db connection: %s", err) + } + defer db.Close() + + inst := &instance{db: db, version: semver.MustParse("16.0.0")} + + columns := []string{ + "backend_type", + "object", + "context", + "reads", + "read_time", + "writes", + "write_time", + "writebacks", + "writeback_time", + "extends", + "extend_time", + "hits", + "evictions", + "reuses", + "fsyncs", + "fsync_time"} + + rows := sqlmock.NewRows(columns). 
+ AddRow("vacuum", "relation", "vacuum", + 45, 3466.5, + 12, 3467.67, + 2, 4.5, + 1, 1.2, + 1234, 3, 56, + 1235, 12.0) + + mock.ExpectQuery(sanitizeQuery(statIOQuery)).WillReturnRows(rows) + + ch := make(chan prometheus.Metric) + go func() { + defer close(ch) + c := StatIOCollector{} + + if err := c.Update(context.Background(), inst, ch); err != nil { + t.Errorf("Error calling PGStatStatementsCollector.Update: %s", err) + } + }() + + expected := []MetricResult{ + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 45}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 3466.5}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 12}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 3467.67}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 2}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 4.5}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 1}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 1.2}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 1234}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 3}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 56}, + 
{labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 1235}, + {labels: labelMap{"backend_type": "vacuum", "object": "relation", "context": "vacuum"}, metricType: dto.MetricType_COUNTER, value: 12.0}, + } + + convey.Convey("Metrics comparison", t, func() { + for _, expect := range expected { + m := readMetric(<-ch) + convey.So(expect, convey.ShouldResemble, m) + } + }) + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("there were unfulfilled exceptions: %s", err) + } +} + +func TestPGStatIOCollectorNull(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("Error opening a stub db connection: %s", err) + } + defer db.Close() + + inst := &instance{db: db, version: semver.MustParse("16.0.0")} + + columns := []string{ + "backend_type", + "object", + "context", + "reads", + "read_time", + "writes", + "write_time", + "writebacks", + "writeback_time", + "extends", + "extend_time", + "hits", + "evictions", + "reuses", + "fsyncs", + "fsync_time"} + + rows := sqlmock.NewRows(columns).AddRow(nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil) + + mock.ExpectQuery(sanitizeQuery(statIOQuery)).WillReturnRows(rows) + + ch := make(chan prometheus.Metric) + go func() { + defer close(ch) + c := StatIOCollector{} + + if err := c.Update(context.Background(), inst, ch); err != nil { + t.Errorf("Error calling PGStatStatementsCollector.Update: %s", err) + } + }() + + expected := []MetricResult{} + + convey.Convey("Metrics comparison", t, func() { + for _, expect := range expected { + m := readMetric(<-ch) + convey.So(expect, convey.ShouldResemble, m) + } + }) + // since we have no expected metrics, wait for the channel to close and then `Update` will have run. 
+ <-ch + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("there were unfulfilled exceptions: %s", err) + } +} From 0253b00b3640ac6b590485d2006ee11db7e59afe Mon Sep 17 00:00:00 2001 From: Peter Nuttall Date: Thu, 19 Jun 2025 14:23:55 +0000 Subject: [PATCH 3/3] copyright Signed-off-by: Peter Nuttall --- collector/pg_stat_io.go | 2 +- collector/pg_stat_io_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collector/pg_stat_io.go b/collector/pg_stat_io.go index fd7cdb428..ceea91d39 100644 --- a/collector/pg_stat_io.go +++ b/collector/pg_stat_io.go @@ -1,4 +1,4 @@ -// Copyright 2023 The Prometheus Authors +// Copyright The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at diff --git a/collector/pg_stat_io_test.go b/collector/pg_stat_io_test.go index 48e1f84a7..9c5d63fce 100644 --- a/collector/pg_stat_io_test.go +++ b/collector/pg_stat_io_test.go @@ -1,4 +1,4 @@ -// Copyright 2023 The Prometheus Authors +// Copyright The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at