
Commit

Since query timeout and cancel use the same error code, we need an additional metric label to track what happened. This should also help with better alerting control.
niksajakovljevic committed Dec 23, 2022
1 parent 15c0283 commit 9ce3f73
Showing 13 changed files with 75 additions and 50 deletions.
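The alerting payoff is easiest to see in PromQL. A hypothetical pair of expressions enabled by the new label is sketched below; the full metric name promscale_query_requests_total is an assumption (this diff only shows the "query" subsystem and "requests_total" name in pkg/pgmodel/metrics/query.go), and existing dashboards can recover the old series shape with sum without (err) (...).

```promql
# Hypothetical alert expression: fires only on genuine timeouts.
sum by (handler) (rate(promscale_query_requests_total{type="metric", err="timeout"}[5m])) > 0

# Client cancellations share the same 503 code but are now a separate series.
sum by (handler) (rate(promscale_query_requests_total{type="metric", err="canceled"}[5m]))
```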
6 changes: 3 additions & 3 deletions pkg/api/alerts.go
@@ -11,17 +11,17 @@ import (
"github.com/NYTimes/gziphandler"
)

-func Alerts(conf *Config, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Alerts(conf *Config, updateMetrics updateMetricCallback) http.Handler {
hf := corsWrapper(conf, alertsHandler(conf, updateMetrics))
return gziphandler.GzipHandler(hf)
}

-func alertsHandler(apiConf *Config, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func alertsHandler(apiConf *Config, updateMetrics updateMetricCallback) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
statusCode := "400"
begin := time.Now()
defer func() {
updateMetrics("/api/v1/alerts", statusCode, time.Since(begin).Seconds())
updateMetrics("/api/v1/alerts", statusCode, "", time.Since(begin).Seconds())
}()

if apiConf.Rules == nil {
4 changes: 2 additions & 2 deletions pkg/api/metrics.go
@@ -30,8 +30,8 @@ func updateIngestMetrics(code string, duration, receivedSamples, receivedMetadat
pgMetrics.IngestorItemsReceived.With(prometheus.Labels{"type": "metric", "kind": "metadata"}).Observe(receivedMetadata)
}

-func updateQueryMetrics(handler, code string, duration float64) {
-pgMetrics.Query.With(prometheus.Labels{"type": "metric", "code": code, "handler": handler}).Inc()
+func updateQueryMetrics(handler, code, err string, duration float64) {
+pgMetrics.Query.With(prometheus.Labels{"type": "metric", "code": code, "handler": handler, "err": err}).Inc()
pgMetrics.QueryDuration.With(prometheus.Labels{"type": "metric", "code": code, "handler": handler}).Observe(duration)
}

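A minimal sketch of the new calling convention follows; it assumes the counter lives in a "promscale" namespace (only the "query" subsystem appears in this diff). client_golang's With requires a value for every declared label, so success paths pass an empty err rather than omitting it.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Mirrors pgMetrics.Query with its new four-label set.
	query := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "promscale", // assumption: the namespace is not shown in this diff
		Subsystem: "query",
		Name:      "requests_total",
		Help:      "Number of query requests to Promscale.",
	}, []string{"type", "handler", "code", "err"})

	// Every declared label must be supplied; a successful request
	// simply reports err="".
	query.With(prometheus.Labels{
		"type":    "metric",
		"handler": "/api/v1/query",
		"code":    "2xx",
		"err":     "",
	}).Inc()
	fmt.Println("recorded")
}
```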
13 changes: 8 additions & 5 deletions pkg/api/query.go
@@ -15,17 +15,18 @@ import (
"github.com/timescale/promscale/pkg/promql"
)

-func Query(conf *Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Query(conf *Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.Handler {
hf := corsWrapper(conf, queryHandler(queryEngine, queryable, updateMetrics))
return gziphandler.GzipHandler(hf)
}

-func queryHandler(queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func queryHandler(queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
statusCode := "400"
+errReason := ""
begin := time.Now()
defer func() {
updateMetrics("/api/v1/query", statusCode, time.Since(begin).Seconds())
updateMetrics("/api/v1/query", statusCode, errReason, time.Since(begin).Seconds())
}()
var ts time.Time
var err error
@@ -63,11 +64,13 @@ func queryHandler(queryEngine *promql.Engine, queryable promql.Queryable, update
switch res.Err.(type) {
case promql.ErrQueryCanceled:
statusCode = "503"
respondError(w, http.StatusServiceUnavailable, res.Err, "canceled")
errReason = errCanceled
respondError(w, http.StatusServiceUnavailable, res.Err, errCanceled)
return
case promql.ErrQueryTimeout:
statusCode = "503"
respondError(w, http.StatusServiceUnavailable, res.Err, "timeout")
errReason = errTimeout
respondError(w, http.StatusServiceUnavailable, res.Err, errTimeout)
return
case promql.ErrStorage:
statusCode = "500"
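The switch above is the crux of the commit: promql.ErrQueryCanceled and promql.ErrQueryTimeout are distinct error types that both map to HTTP 503, so the code label alone cannot tell them apart. A self-contained sketch of that classification, with stand-in error types in place of the promql ones:

```go
package main

import "fmt"

// Stand-ins for promql.ErrQueryTimeout and promql.ErrQueryCanceled, which
// are distinct error types even though both produce HTTP 503.
type errQueryTimeout string
type errQueryCanceled string

func (e errQueryTimeout) Error() string  { return string(e) }
func (e errQueryCanceled) Error() string { return string(e) }

// classify mirrors the switch in queryHandler: the status code alone cannot
// distinguish the two failure modes, hence the new err metric label.
func classify(err error) (code, reason string) {
	switch err.(type) {
	case errQueryCanceled:
		return "503", "canceled"
	case errQueryTimeout:
		return "503", "timeout"
	default:
		return "500", ""
	}
}

func main() {
	fmt.Println(classify(errQueryTimeout("query timed out")))  // 503 timeout
	fmt.Println(classify(errQueryCanceled("query canceled"))) // 503 canceled
}
```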
6 changes: 3 additions & 3 deletions pkg/api/query_exemplar.go
@@ -17,17 +17,17 @@ import (
"github.com/timescale/promscale/pkg/promql"
)

-func QueryExemplar(conf *Config, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func QueryExemplar(conf *Config, queryable promql.Queryable, updateMetrics updateMetricCallback) http.Handler {
hf := corsWrapper(conf, queryExemplar(queryable, updateMetrics))
return gziphandler.GzipHandler(hf)
}

-func queryExemplar(queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func queryExemplar(queryable promql.Queryable, updateMetrics updateMetricCallback) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
statusCode := "400"
begin := time.Now()
defer func() {
updateMetrics("/api/v1/query_exemplars", statusCode, time.Since(begin).Seconds())
updateMetrics("/api/v1/query_exemplars", statusCode, "", time.Since(begin).Seconds())
}()
start, err := parseTime(r.FormValue("start"))
if err != nil {
13 changes: 8 additions & 5 deletions pkg/api/query_range.go
@@ -18,17 +18,18 @@ import (
"github.com/timescale/promscale/pkg/query"
)

-func QueryRange(conf *Config, promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func QueryRange(conf *Config, promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.Handler {
hf := corsWrapper(conf, queryRange(promqlConf, queryEngine, queryable, updateMetrics))
return gziphandler.GzipHandler(hf)
}

-func queryRange(promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func queryRange(promqlConf *query.Config, queryEngine *promql.Engine, queryable promql.Queryable, updateMetrics updateMetricCallback) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
statusCode := "400"
+errReason := ""
begin := time.Now()
defer func() {
updateMetrics("/api/v1/query_range", statusCode, time.Since(begin).Seconds())
updateMetrics("/api/v1/query_range", statusCode, errReason, time.Since(begin).Seconds())
}()
start, err := parseTime(r.FormValue("start"))
if err != nil {
@@ -109,11 +110,13 @@ func queryRange(promqlConf *query.Config, queryEngine *promql.Engine, queryable
switch res.Err.(type) {
case promql.ErrQueryCanceled:
statusCode = "503"
respondError(w, http.StatusServiceUnavailable, res.Err, "canceled")
errReason = errCanceled
respondError(w, http.StatusServiceUnavailable, res.Err, errCanceled)
return
case promql.ErrQueryTimeout:
statusCode = "503"
respondError(w, http.StatusServiceUnavailable, res.Err, "timeout")
errReason = errTimeout
respondError(w, http.StatusServiceUnavailable, res.Err, errTimeout)
return
case promql.ErrStorage:
statusCode = "500"
4 changes: 2 additions & 2 deletions pkg/api/query_range_test.go
@@ -113,7 +113,7 @@ func TestRangedQuery(t *testing.T) {
end: "2",
step: "1s",
expectCode: http.StatusServiceUnavailable,
expectError: "timeout",
expectError: errTimeout,
timeout: "1s",
metric: "m",
querier: &mockQuerier{
@@ -125,7 +125,7 @@
end: "2",
step: "1s",
expectCode: http.StatusServiceUnavailable,
expectError: "canceled",
expectError: errCanceled,
metric: "m",
querier: &mockQuerier{},
canceled: true,
4 changes: 2 additions & 2 deletions pkg/api/query_test.go
@@ -186,7 +186,7 @@ func TestQuery(t *testing.T) {
}, {
name: "Timeout query",
expectCode: http.StatusServiceUnavailable,
expectError: "timeout",
expectError: errTimeout,
timeout: "1s",
metric: "m",
querier: &mockQuerier{
@@ -195,7 +195,7 @@
}, {
name: "Cancel query",
expectCode: http.StatusServiceUnavailable,
expectError: "canceled",
expectError: errCanceled,
metric: "m",
querier: &mockQuerier{},
canceled: true,
4 changes: 2 additions & 2 deletions pkg/api/read.go
@@ -19,12 +19,12 @@ import (
"github.com/timescale/promscale/pkg/prompb"
)

-func Read(config *Config, reader querier.Reader, metrics *Metrics, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Read(config *Config, reader querier.Reader, metrics *Metrics, updateMetrics updateMetricCallback) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
statusCode := "400"
begin := time.Now()
defer func() {
updateMetrics("/read", statusCode, time.Since(begin).Seconds())
updateMetrics("/read", statusCode, "", time.Since(begin).Seconds())
}()
if !validateReadHeaders(w, r) {
return
23 changes: 15 additions & 8 deletions pkg/api/router.go
@@ -28,6 +28,13 @@ import (
"github.com/timescale/promscale/pkg/telemetry"
)

+type updateMetricCallback func(handler, code, err string, duration float64)
+
+const (
+errTimeout = "timeout"
+errCanceled = "canceled"
+)
+
// TODO: Refactor this function to reduce number of parameters.
func GenerateRouter(apiConf *Config, promqlConf *query.Config, client *pgclient.Client, store *jaegerStore.Store, authWrapper mux.MiddlewareFunc, reload func() error) (*mux.Router, error) {
var writePreprocessors []parser.Preprocessor
@@ -132,24 +139,24 @@ func RegisterTelemetryMetrics(t telemetry.Engine) error {
}
if err = t.RegisterMetric(
"promscale_metrics_queries_failed_total",
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "422"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "422"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "500"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "500"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "422", "err": ""}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "422", "err": ""}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "500", "err": ""}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "500", "err": ""}),
); err != nil {
return fmt.Errorf("register 'promscale_metrics_queries_failed_total' metric for telemetry: %w", err)
}
if err = t.RegisterMetric(
"promscale_metrics_queries_success_total",
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "2xx"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "2xx"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "2xx", "err": ""}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "2xx", "err": ""}),
); err != nil {
return fmt.Errorf("register 'promscale_metrics_queries_success_total' metric for telemetry: %w", err)
}
if err = t.RegisterMetric(
"promscale_metrics_queries_timedout_total",
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "503"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "503"}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query", "code": "503", "err": ""}),
pgMetrics.Query.With(prometheus.Labels{"type": "metric", "handler": "/api/v1/query_range", "code": "503", "err": ""}),
); err != nil {
return fmt.Errorf("register 'promscale_metrics_queries_timedout_total' metric for telemetry: %w", err)
}
6 changes: 3 additions & 3 deletions pkg/api/rules.go
@@ -19,7 +19,7 @@ import (
"github.com/timescale/promscale/pkg/log"
)

-func Rules(conf *Config, updateMetrics func(handler, code string, duration float64)) http.Handler {
+func Rules(conf *Config, updateMetrics updateMetricCallback) http.Handler {
hf := corsWrapper(conf, rulesHandler(conf, updateMetrics))
return gziphandler.GzipHandler(hf)
}
@@ -96,12 +96,12 @@ type Alert struct {
Value string `json:"value"`
}

-func rulesHandler(apiConf *Config, updateMetrics func(handler, code string, duration float64)) http.HandlerFunc {
+func rulesHandler(apiConf *Config, updateMetrics updateMetricCallback) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
statusCode := "400"
begin := time.Now()
defer func() {
updateMetrics("/api/v1/rules", statusCode, time.Since(begin).Seconds())
updateMetrics("/api/v1/rules", statusCode, "", time.Since(begin).Seconds())
}()

queryType := strings.ToLower(r.URL.Query().Get("type"))
4 changes: 2 additions & 2 deletions pkg/api/write_test.go
@@ -366,8 +366,8 @@ func mockUpdaterForIngest(counter, histogram, numSamples, numMetadata *mockMetri
}
}

-func mockUpdaterForQuery(counter, histogram *mockMetric) func(handler, code string, duration float64) {
-return func(_, _ string, duration float64) {
+func mockUpdaterForQuery(counter, histogram *mockMetric) updateMetricCallback {
+return func(_, _, _ string, duration float64) {
counter.value++
applyValueIfMetricNotNil(histogram, duration)
}
36 changes: 24 additions & 12 deletions pkg/jaeger/store/store.go
@@ -65,8 +65,10 @@ func (p *Store) GetTrace(ctx context.Context, traceID model.TraceID) (*model.Tra
code := "5xx"
start := time.Now()
defer func() {
metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Trace", "code": code}).Inc()
metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Trace", "code": code}).Observe(time.Since(start).Seconds())
labels := prometheus.Labels{"type": "trace", "handler": "Get_Trace", "code": code, "err": ""}
metrics.Query.With(labels).Inc()
delete(labels, "err")
metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
}()
res, err := getTrace(ctx, p.builder, p.conn, traceID)

@@ -86,8 +88,10 @@ func (p *Store) GetServices(ctx context.Context) ([]string, error) {
code := "5xx"
start := time.Now()
defer func() {
metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Services", "code": code}).Inc()
metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Services", "code": code}).Observe(time.Since(start).Seconds())
labels := prometheus.Labels{"type": "trace", "handler": "Get_Services", "code": code, "err": ""}
metrics.Query.With(labels).Inc()
delete(labels, "err")
metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
}()
res, err := getServices(ctx, p.conn)
if err != nil {
@@ -101,8 +105,10 @@ func (p *Store) GetOperations(ctx context.Context, query spanstore.OperationQuer
code := "5xx"
start := time.Now()
defer func() {
metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Operations", "code": code}).Inc()
metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Operations", "code": code}).Observe(time.Since(start).Seconds())
labels := prometheus.Labels{"type": "trace", "handler": "Get_Operations", "code": code, "err": ""}
metrics.Query.With(labels).Inc()
delete(labels, "err")
metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
}()
res, err := getOperations(ctx, p.conn, query)
if err != nil {
@@ -116,8 +122,10 @@ func (p *Store) FindTraces(ctx context.Context, query *spanstore.TraceQueryParam
code := "5xx"
start := time.Now()
defer func() {
metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Find_Traces", "code": code}).Inc()
metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Find_Traces", "code": code}).Observe(time.Since(start).Seconds())
labels := prometheus.Labels{"type": "trace", "handler": "Find_Traces", "code": code, "err": ""}
metrics.Query.With(labels).Inc()
delete(labels, "err")
metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
}()
res, err := findTraces(ctx, p.builder, p.conn, query)
if err != nil {
@@ -132,8 +140,10 @@ func (p *Store) FindTraceIDs(ctx context.Context, query *spanstore.TraceQueryPar
code := "5xx"
start := time.Now()
defer func() {
metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Find_Trace_IDs", "code": code}).Inc()
metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Find_Trace_IDs", "code": code}).Observe(time.Since(start).Seconds())
labels := prometheus.Labels{"type": "trace", "handler": "Find_Trace_IDs", "code": code, "err": ""}
metrics.Query.With(labels).Inc()
delete(labels, "err")
metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
}()
res, err := findTraceIDs(ctx, p.builder, p.conn, query)
if err != nil {
@@ -148,8 +158,10 @@ func (p *Store) GetDependencies(ctx context.Context, endTs time.Time, lookback t
code := "5xx"
start := time.Now()
defer func() {
metrics.Query.With(prometheus.Labels{"type": "trace", "handler": "Get_Dependencies", "code": code}).Inc()
metrics.QueryDuration.With(prometheus.Labels{"type": "trace", "handler": "Get_Dependencies", "code": code}).Observe(time.Since(start).Seconds())
labels := prometheus.Labels{"type": "trace", "handler": "Get_Dependencies", "code": code, "err": ""}
metrics.Query.With(labels).Inc()
delete(labels, "err")
metrics.QueryDuration.With(labels).Observe(time.Since(start).Seconds())
}()

res, err := getDependencies(ctx, p.conn, endTs, lookback)
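The repeated delete(labels, "err") above, like the "err": "" entries added to the telemetry registrations in pkg/api/router.go, follows from the same client_golang rule: With panics unless the supplied labels exactly match the vector's declared label set, and QueryDuration deliberately keeps its old labels without err. A minimal sketch of the constraint:

```go
package main

import "github.com/prometheus/client_golang/prometheus"

func main() {
	requests := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "demo_requests_total"},
		[]string{"handler", "code", "err"},
	)
	duration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{Name: "demo_duration_seconds"},
		[]string{"handler", "code"}, // no "err" label here
	)

	labels := prometheus.Labels{"handler": "Get_Trace", "code": "5xx", "err": ""}
	requests.With(labels).Inc()

	// Reusing the same map for the histogram would panic with an
	// inconsistent-cardinality error, so the extra key is dropped first.
	delete(labels, "err")
	duration.With(labels).Observe(0.042)
}
```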
2 changes: 1 addition & 1 deletion pkg/pgmodel/metrics/query.go
@@ -21,7 +21,7 @@ var (
Subsystem: "query",
Name: "requests_total",
Help: "Number of query requests to Promscale.",
}, []string{"type", "handler", "code"},
}, []string{"type", "handler", "code", "err"},
)
)

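Worth noting for cardinality: throughout this commit the err label takes only three values — "" on success, errTimeout ("timeout"), and errCanceled ("canceled") — so the series count per type/handler/code combination grows by at most a factor of three.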
