From 9ce3f73ca3d12ef6ab13a282ecb319ac5a1ff1a3 Mon Sep 17 00:00:00 2001
From: niksajakovljevic
Date: Tue, 20 Dec 2022 10:51:04 +0100
Subject: [PATCH] Since query timeout and cancel use the same error code, we
 need an additional metric label to track what happened. This should also
 help with better alerting control.

---
 pkg/api/alerts.go            |  6 +++---
 pkg/api/metrics.go           |  4 ++--
 pkg/api/query.go             | 13 ++++++++-----
 pkg/api/query_exemplar.go    |  6 +++---
 pkg/api/query_range.go       | 13 ++++++++-----
 pkg/api/query_range_test.go  |  4 ++--
 pkg/api/query_test.go        |  4 ++--
 pkg/api/read.go              |  4 ++--
 pkg/api/router.go            | 23 +++++++++++++++--------
 pkg/api/rules.go             |  6 +++---
 pkg/api/write_test.go        |  4 ++--
 pkg/jaeger/store/store.go    | 36 ++++++++++++++++++++++++------------
 pkg/pgmodel/metrics/query.go |  2 +-
 13 files changed, 75 insertions(+), 50 deletions(-)

diff --git a/pkg/api/alerts.go b/pkg/api/alerts.go
index 6b8bbc1759..d2dc7baa73 100644
--- a/pkg/api/alerts.go
+++ b/pkg/api/alerts.go
@@ -11,17 +11,17 @@ import (
 	"github.com/NYTimes/gziphandler"
 )
 
-func Alerts(conf *Config, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Alerts(conf *Config, updateMetrics updateMetricCallback) http.Handler {
 	hf := corsWrapper(conf, alertsHandler(conf, updateMetrics))
 	return gziphandler.GzipHandler(hf)
 }
 
-func alertsHandler(apiConf *Config, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func alertsHandler(apiConf *Config, updateMetrics updateMetricCallback) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		statusCode := "400"
 		begin := time.Now()
 		defer func() {
-			updateMetrics("/api/v1/alerts", statusCode, time.Since(begin).Seconds())
+			updateMetrics("/api/v1/alerts", statusCode, "", time.Since(begin).Seconds())
 		}()
 
 		if apiConf.Rules == nil {
diff --git a/pkg/api/metrics.go b/pkg/api/metrics.go
index 86dadc668c..b231681b8d 100644
--- a/pkg/api/metrics.go
+++ b/pkg/api/metrics.go
@@ -30,8 +30,8 @@ func updateIngestMetrics(code string, duration, receivedSamples, receivedMetadat
 	pgMetrics.IngestorItemsReceived.With(prometheus.Labels{"type": "metric", "kind": "metadata"}).Observe(receivedMetadata)
 }
 
-func updateQueryMetrics(handler, code string, duration float64) {
-	pgMetrics.Query.With(prometheus.Labels{"type": "metric", "code": code, "handler": handler}).Inc()
+func updateQueryMetrics(handler, code, err string, duration float64) {
+	pgMetrics.Query.With(prometheus.Labels{"type": "metric", "code": code, "handler": handler, "err": err}).Inc()
 	pgMetrics.QueryDuration.With(prometheus.Labels{"type": "metric", "code": code, "handler": handler}).Observe(duration)
 }
 
diff --git a/pkg/api/query.go b/pkg/api/query.go
index ca8f290e44..531458458c 100644
--- a/pkg/api/query.go
+++ b/pkg/api/query.go
@@ -15,17 +15,18 @@ import (
 	"github.com/timescale/promscale/pkg/promql"
 )
 
-func Query(conf *Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Query(conf *Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.Handler {
 	hf := corsWrapper(conf, queryHandler(queryEngine, queryable, updateMetrics))
 	return gziphandler.GzipHandler(hf)
 }
 
-func queryHandler(queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func queryHandler(queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		statusCode := "400"
+		errReason := ""
 		begin := time.Now()
 		defer func() {
-			updateMetrics("/api/v1/query", statusCode, time.Since(begin).Seconds())
+			updateMetrics("/api/v1/query", statusCode, errReason, time.Since(begin).Seconds())
 		}()
 		var ts time.Time
 		var err error
@@ -63,11 +64,13 @@ func queryHandler(queryEngine *promql.Engine, queryable promql.Queryable, update
 		switch res.Err.(type) {
 		case promql.ErrQueryCanceled:
 			statusCode = "503"
-			respondError(w, http.StatusServiceUnavailable, res.Err, "canceled")
+			errReason = errCanceled
+			respondError(w, http.StatusServiceUnavailable, res.Err, errCanceled)
 			return
 		case promql.ErrQueryTimeout:
 			statusCode = "503"
-			respondError(w, http.StatusServiceUnavailable, res.Err, "timeout")
+			errReason = errTimeout
+			respondError(w, http.StatusServiceUnavailable, res.Err, errTimeout)
 			return
 		case promql.ErrStorage:
 			statusCode = "500"
diff --git a/pkg/api/query_exemplar.go b/pkg/api/query_exemplar.go
index 0f366ce41c..8cc3374779 100644
--- a/pkg/api/query_exemplar.go
+++ b/pkg/api/query_exemplar.go
@@ -17,17 +17,17 @@ import (
 	"github.com/timescale/promscale/pkg/promql"
 )
 
-func QueryExemplar(conf *Config, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func QueryExemplar(conf *Config, queryable promql.Queryable, updateMetrics updateMetricCallback) http.Handler {
 	hf := corsWrapper(conf, queryExemplar(queryable, updateMetrics))
 	return gziphandler.GzipHandler(hf)
 }
 
-func queryExemplar(queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func queryExemplar(queryable promql.Queryable, updateMetrics updateMetricCallback) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		statusCode := "400"
 		begin := time.Now()
 		defer func() {
-			updateMetrics("/api/v1/query_exemplars", statusCode, time.Since(begin).Seconds())
+			updateMetrics("/api/v1/query_exemplars", statusCode, "", time.Since(begin).Seconds())
 		}()
 		start, err := parseTime(r.FormValue("start"))
 		if err != nil {
diff --git a/pkg/api/query_range.go b/pkg/api/query_range.go
index 971e0af602..dba3b6c395 100644
--- a/pkg/api/query_range.go
+++ b/pkg/api/query_range.go
@@ -18,17 +18,18 @@ import (
 	"github.com/timescale/promscale/pkg/query"
 )
 
-func QueryRange(conf *Config, promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func QueryRange(conf *Config, promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.Handler {
 	hf := corsWrapper(conf, queryRange(promqlConf, queryEngine, queryable, updateMetrics))
 	return gziphandler.GzipHandler(hf)
 }
 
-func queryRange(promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func queryRange(promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		statusCode := "400"
+		errReason := ""
 		begin := time.Now()
 		defer func() {
-			updateMetrics("/api/v1/query_range", statusCode, time.Since(begin).Seconds())
+			updateMetrics("/api/v1/query_range", statusCode, errReason, time.Since(begin).Seconds())
 		}()
 		start, err := parseTime(r.FormValue("start"))
 		if err != nil {
@@ -109,11 +110,13 @@ func queryRange(promqlConf *query.Config, queryEngine *promql.Engine, queryable
 		switch res.Err.(type) {
 		case promql.ErrQueryCanceled:
 			statusCode = "503"
-			respondError(w, http.StatusServiceUnavailable, res.Err, "canceled")
+			errReason = errCanceled
+			respondError(w, http.StatusServiceUnavailable, res.Err, errCanceled)
 			return
 		case promql.ErrQueryTimeout:
 			statusCode = "503"
-			respondError(w, http.StatusServiceUnavailable, res.Err, "timeout")
+			errReason = errTimeout
+			respondError(w, http.StatusServiceUnavailable, res.Err, errTimeout)
 			return
 		case promql.ErrStorage:
 			statusCode = "500"
diff --git a/pkg/api/query_range_test.go b/pkg/api/query_range_test.go
index 6336cca9cb..696646f1cc 100644
--- a/pkg/api/query_range_test.go
+++ b/pkg/api/query_range_test.go
@@ -113,7 +113,7 @@ func TestRangedQuery(t *testing.T) {
 			end:         "2",
 			step:        "1s",
 			expectCode:  http.StatusServiceUnavailable,
-			expectError: "timeout",
+			expectError: errTimeout,
 			timeout:     "1s",
 			metric:      "m",
 			querier: &mockQuerier{
@@ -125,7 +125,7 @@ func TestRangedQuery(t *testing.T) {
 			end:         "2",
 			step:        "1s",
 			expectCode:  http.StatusServiceUnavailable,
-			expectError: "canceled",
+			expectError: errCanceled,
 			metric:      "m",
 			querier:     &mockQuerier{},
 			canceled:    true,
diff --git a/pkg/api/query_test.go b/pkg/api/query_test.go
index 68cbd89d5c..d348b4dcd5 100644
--- a/pkg/api/query_test.go
+++ b/pkg/api/query_test.go
@@ -186,7 +186,7 @@ func TestQuery(t *testing.T) {
 		}, {
 			name:        "Timeout query",
 			expectCode:  http.StatusServiceUnavailable,
-			expectError: "timeout",
+			expectError: errTimeout,
 			timeout:     "1s",
 			metric:      "m",
 			querier: &mockQuerier{
@@ -195,7 +195,7 @@ func TestQuery(t *testing.T) {
 		}, {
 			name:        "Cancel query",
 			expectCode:  http.StatusServiceUnavailable,
-			expectError: "canceled",
+			expectError: errCanceled,
 			metric:      "m",
 			querier:     &mockQuerier{},
 			canceled:    true,
diff --git a/pkg/api/read.go b/pkg/api/read.go
index 1954af0b4e..0f4c31612a 100644
--- a/pkg/api/read.go
+++ b/pkg/api/read.go
@@ -19,12 +19,12 @@ import (
 	"github.com/timescale/promscale/pkg/prompb"
 )
 
-func Read(config *Config, reader querier.Reader, metrics *Metrics, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Read(config *Config, reader querier.Reader, metrics *Metrics, updateMetrics updateMetricCallback) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		statusCode := "400"
 		begin := time.Now()
 		defer func() {
-			updateMetrics("/read", statusCode, time.Since(begin).Seconds())
+			updateMetrics("/read", statusCode, "", time.Since(begin).Seconds())
 		}()
 		if !validateReadHeaders(w, r) {
 			return
diff --git a/pkg/api/router.go b/pkg/api/router.go
index 42dba3f007..adf1579854 100644
--- a/pkg/api/router.go
+++ b/pkg/api/router.go
@@ -28,6 +28,13 @@ import (
 	"github.com/timescale/promscale/pkg/telemetry"
 )
 
+type updateMetricCallback func(handler, code, err string, duration float64)
+
+const (
+	errTimeout  = "timeout"
+	errCanceled = "canceled"
+)
+
 // TODO: Refactor this function to reduce number of paramaters.
 func GenerateRouter(apiConf *Config, promqlConf *query.Config, client *pgclient.Client, store *jaegerStore.Store, authWrapper mux.MiddlewareFunc, reload func() error) (*mux.Router, error) {
 	var writePreprocessors []parser.Preprocessor
@@ -132,24 +139,24 @@ func RegisterTelemetryMetrics(t telemetry.Engine) error {
 	}
 	if err = t.RegisterMetric(
 		"promscale_metrics_queries_failed_total",
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "422"}),
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "422"}),
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "500"}),
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "500"}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "422", "err": ""}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "422", "err": ""}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "500", "err": ""}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "500", "err": ""}),
 	); err != nil {
 		return fmt.Errorf("register 'promscale_metrics_queries_failed_total' metric for telemetry: %w", err)
 	}
 	if err = t.RegisterMetric(
 		"promscale_metrics_queries_success_total",
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "2xx"}),
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "2xx"}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "2xx", "err": ""}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "2xx", "err": ""}),
 	); err != nil {
 		return fmt.Errorf("register 'promscale_metrics_queries_success_total' metric for telemetry: %w", err)
 	}
 	if err = t.RegisterMetric(
 		"promscale_metrics_queries_timedout_total",
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "503"}),
-		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "503"}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "503", "err": ""}),
+		pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "503", "err": ""}),
 	); err != nil {
 		return fmt.Errorf("register 'promscale_metrics_queries_timedout_total' metric for telemetry: %w", err)
 	}
diff --git a/pkg/api/rules.go b/pkg/api/rules.go
index e5e98c36bb..544a6ca00d 100644
--- a/pkg/api/rules.go
+++ b/pkg/api/rules.go
@@ -19,7 +19,7 @@ import (
 	"github.com/timescale/promscale/pkg/log"
 )
 
-func Rules(conf *Config, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Rules(conf *Config, updateMetrics updateMetricCallback) http.Handler {
 	hf := corsWrapper(conf, rulesHandler(conf, updateMetrics))
 	return gziphandler.GzipHandler(hf)
 }
@@ -96,12 +96,12 @@ type Alert struct {
 	Value string `json:"value"`
 }
 
-func rulesHandler(apiConf *Config, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func rulesHandler(apiConf *Config, updateMetrics updateMetricCallback) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		statusCode := "400"
 		begin := time.Now()
 		defer func() {
updateMetrics("/api/v1/rules", statusCode, time.Since(begin).Seconds()) + updateMetrics("/api/v1/rules", statusCode, "", time.Since(begin).Seconds()) }() queryType := strings.ToLower(r.URL.Query().Get("type")) diff --git a/pkg/api/write_test.go b/pkg/api/write_test.go index 7f0a71e662..dd35465733 100644 --- a/pkg/api/write_test.go +++ b/pkg/api/write_test.go @@ -366,8 +366,8 @@ func mockUpdaterForIngest(counter, histogram, numSamples, numMetadata *mockMetri } } -func mockUpdaterForQuery(counter, histogram *mockMetric) func(handler, code string, duration float64) { - return func(_, _ string, duration float64) { +func mockUpdaterForQuery(counter, histogram *mockMetric) updateMetricCallback { + return func(_, _, _ string, duration float64) { counter.value++ applyValueIfMetricNotNil(histogram, duration) } diff --git a/pkg/jaeger/store/store.go b/pkg/jaeger/store/store.go index 60863fe55b..0e9702f39a 100644 --- a/pkg/jaeger/store/store.go +++ b/pkg/jaeger/store/store.go @@ -65,8 +65,10 @@ func (p *Store) GetTrace(ctx context.Context, traceID model.TraceID) (*model.Tra code := "5xx" start := time.Now() defer func() { - metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Trace", "code": code}).Inc() - metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Trace", "code": code}).Observe(time.Since(start).Seconds()) + labels := prometheus.Labels{"type": "trace", "handler": "Get_Trace", "code": code, "err": ""} + metrics.Query.With(labels).Inc() + delete(labels, "err") + metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds()) }() res, err := getTrace(ctx, p.builder, p.conn, traceID) @@ -86,8 +88,10 @@ func (p *Store) GetServices(ctx context.Context) ([]string, error) { code := "5xx" start := time.Now() defer func() { - metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Services", "code": code}).Inc() - metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Services", "code": code}).Observe(time.Since(start).Seconds()) + labels := prometheus.Labels{"type": "trace", "handler": "Get_Services", "code": code, "err": ""} + metrics.Query.With(labels).Inc() + delete(labels, "err") + metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds()) }() res, err := getServices(ctx, p.conn) if err != nil { @@ -101,8 +105,10 @@ func (p *Store) GetOperations(ctx context.Context, query spanstore.OperationQuer code := "5xx" start := time.Now() defer func() { - metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Operations", "code": code}).Inc() - metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Operations", "code": code}).Observe(time.Since(start).Seconds()) + labels := prometheus.Labels{"type": "trace", "handler": "Get_Operations", "code": code, "err": ""} + metrics.Query.With(labels).Inc() + delete(labels, "err") + metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds()) }() res, err := getOperations(ctx, p.conn, query) if err != nil { @@ -116,8 +122,10 @@ func (p *Store) FindTraces(ctx context.Context, query *spanstore.TraceQueryParam code := "5xx" start := time.Now() defer func() { - metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Find_Traces", "code": code}).Inc() - metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Find_Traces", "code": code}).Observe(time.Since(start).Seconds()) + labels := prometheus.Labels{"type": "trace", "handler": "Find_Traces", "code": code, "err": ""} + 
+		metrics.Query.With(labels).Inc()
+		delete(labels, "err")
+		metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
 	}()
 	res, err := findTraces(ctx, p.builder, p.conn, query)
 	if err != nil {
@@ -132,8 +140,10 @@ func (p *Store) FindTraceIDs(ctx context.Context, query *spanstore.TraceQueryPar
 	code := "5xx"
 	start := time.Now()
 	defer func() {
-		metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Find_Trace_IDs", "code": code}).Inc()
-		metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Find_Trace_IDs", "code": code}).Observe(time.Since(start).Seconds())
+		labels := prometheus.Labels{"type": "trace", "handler": "Find_Trace_IDs", "code": code, "err": ""}
+		metrics.Query.With(labels).Inc()
+		delete(labels, "err")
+		metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
 	}()
 	res, err := findTraceIDs(ctx, p.builder, p.conn, query)
 	if err != nil {
@@ -148,8 +158,10 @@ func (p *Store) GetDependencies(ctx context.Context, endTs time.Time, lookback t
 	code := "5xx"
 	start := time.Now()
 	defer func() {
-		metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Dependencies", "code": code}).Inc()
-		metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Dependencies", "code": code}).Observe(time.Since(start).Seconds())
+		labels := prometheus.Labels{"type": "trace", "handler": "Get_Dependencies", "code": code, "err": ""}
+		metrics.Query.With(labels).Inc()
+		delete(labels, "err")
+		metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
 	}()
 
 	res, err := getDependencies(ctx, p.conn, endTs, lookback)
diff --git a/pkg/pgmodel/metrics/query.go b/pkg/pgmodel/metrics/query.go
index 1d5729bcef..e2b932daa9 100644
--- a/pkg/pgmodel/metrics/query.go
+++ b/pkg/pgmodel/metrics/query.go
@@ -21,7 +21,7 @@ var (
 			Subsystem: "query",
 			Name:      "requests_total",
 			Help:      "Number of query requests to Promscale.",
-		}, []string{"type", "handler", "code"},
+		}, []string{"type", "handler", "code", "err"},
 	)
 )
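
Illustrative usage (a sketch, not part of the patch): the snippet below shows how a handler written against the new updateMetricCallback signature reports the err label, leaving it empty on success and setting it to errTimeout or errCanceled when the shared 503 code alone cannot tell the two apart. The handler path "/api/v1/example" and the run parameter are assumptions made for the example, not code from this change. With the extra label in place, alerting rules can select timeouts on the query requests_total counter separately from client cancellations, which is the alerting control the commit message refers to.

// Sketch only: exercises the updateMetricCallback, errTimeout, and errCanceled
// identifiers introduced in pkg/api/router.go; everything else is illustrative.
package api

import (
	"context"
	"errors"
	"net/http"
	"time"
)

func exampleHandler(run func(ctx context.Context) error, updateMetrics updateMetricCallback) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		statusCode := "400"
		errReason := "" // stays empty unless the request times out or is canceled
		begin := time.Now()
		defer func() {
			updateMetrics("/api/v1/example", statusCode, errReason, time.Since(begin).Seconds())
		}()

		err := run(r.Context())
		switch {
		case err == nil:
			statusCode = "2xx"
			w.WriteHeader(http.StatusOK)
		case errors.Is(err, context.DeadlineExceeded):
			// Timeout and cancellation both map to 503, so only the err label
			// distinguishes them in the requests_total metric.
			statusCode = "503"
			errReason = errTimeout
			http.Error(w, err.Error(), http.StatusServiceUnavailable)
		case errors.Is(err, context.Canceled):
			statusCode = "503"
			errReason = errCanceled
			http.Error(w, err.Error(), http.StatusServiceUnavailable)
		default:
			statusCode = "500"
			http.Error(w, err.Error(), http.StatusInternalServerError)
		}
	}
}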