From 1f1575fd6e91860ad9c7ad17a8be5a7c6079be9f Mon Sep 17 00:00:00 2001 From: Filip Petkovski Date: Mon, 6 May 2024 11:15:05 +0200 Subject: [PATCH] Add timeout to endpointset metric collector We have seen deadlocks with endpoint discovery caused by the metric collector hanging and not releasing the store labels lock. This causes the endpoint update to hang, which also makes all endpoint readers hang on acquiring a read lock for the resolved endpoints slice. This commit makes sure the Collect method on the metrics collector has a built in timeout to guard against cases where an upstream call never reads from the collection channel. Signed-off-by: Filip Petkovski --- pkg/query/endpointset.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pkg/query/endpointset.go b/pkg/query/endpointset.go index a8005c571d..f0daa86800 100644 --- a/pkg/query/endpointset.go +++ b/pkg/query/endpointset.go @@ -197,15 +197,17 @@ type endpointSetNodeCollector struct { storeNodes map[component.Component]map[string]int storePerExtLset map[string]int + logger log.Logger connectionsDesc *prometheus.Desc labels []string } -func newEndpointSetNodeCollector(labels ...string) *endpointSetNodeCollector { +func newEndpointSetNodeCollector(logger log.Logger, labels ...string) *endpointSetNodeCollector { if len(labels) == 0 { labels = []string{string(ExternalLabels), string(StoreType)} } return &endpointSetNodeCollector{ + logger: logger, storeNodes: map[component.Component]map[string]int{}, connectionsDesc: prometheus.NewDesc( "thanos_store_nodes_grpc_connections", @@ -272,7 +274,12 @@ func (c *endpointSetNodeCollector) Collect(ch chan<- prometheus.Metric) { lbls = append(lbls, storeTypeStr) } } - ch <- prometheus.MustNewConstMetric(c.connectionsDesc, prometheus.GaugeValue, float64(occurrences), lbls...) + select { + case ch <- prometheus.MustNewConstMetric(c.connectionsDesc, prometheus.GaugeValue, float64(occurrences), lbls...): + case <-time.After(1 * time.Second): + level.Warn(c.logger).Log("msg", "failed to collect endpointset metrics", "timeout", 1*time.Second) + return + } } } } @@ -314,7 +321,7 @@ func NewEndpointSet( endpointInfoTimeout time.Duration, endpointMetricLabels ...string, ) *EndpointSet { - endpointsMetric := newEndpointSetNodeCollector(endpointMetricLabels...) + endpointsMetric := newEndpointSetNodeCollector(logger, endpointMetricLabels...) if reg != nil { reg.MustRegister(endpointsMetric) }