Emit per workload labels for existing per table vttablet metrics (vit…

…essio#12394) * Emit per workload labels for existing per table vttablet metrics This adds the possibility to configure vttablet (via CLI flag) to also have a workload label for existing per table metrics (query counts, query times, query errors, query rows affected, query rows returned, query error counts). Workload can be any string that makes sense for the client application. For example, API endpoint name, controller, batch job name, application name or something else. This is usefult to be able to gain observability about how the query load is distributed across different workloads. This is achieved with two new CLI flags, namely: * `enable-per-workload-table-metrics`: whether to enable or disable per workload metric collection - disabled by default to preserve the current behavior, thus making the new feature opt-in only. * `workload-label`: a string to look for in query comments to identify the workload running the current query. The workload is obtained by parsing query comments of the form: /* ... <workload_label>=<workload_name>; ... */ For example, if vttablet is started with `--enable-per-workload-table-metrics --workload-label app_name` anda query is issued with a comment like /* ... app_name=shop; ... */ then metrics will look like ``` vttablet_query_counts{plan="Select",table="dual", workload="shop"} 15479 ``` instead of ``` vttablet_query_counts{plan="Select",table="dual"} 15479 ``` Query comment parsing only takes place if `--enable-per-workload-table-metrics` is used, as to not incur parsing performance impact if the user does not want per workload metrics. Signed-off-by: Eduardo J. Ortega U <[email protected]> * make linter happy Signed-off-by: Eduardo J. Ortega U <[email protected]> * fix flags e2e test Signed-off-by: Eduardo J. Ortega U <[email protected]> * Address PR comments: * Obtain workload information on the vtgate instead of the vttablet, avoiding double parsing. * Treat workload name as a query directive. * Send workload name from vtgate to vttablet as ExecuteOptions. Additionally, annotate tabletserver's execution span with the workload name to also enrich traces with workload name data, in addition to metrics. Signed-off-by: Eduardo J. Ortega U <[email protected]> * A few fixes: 1. Rebuild some files with `make proto`. 2. Protect against nil ExecuteOptions on the tabletserver. Signed-off-by: Eduardo J. Ortega U <[email protected]> * Fix flags e2e test Signed-off-by: Eduardo J. Ortega U <[email protected]> * Address PR comments Signed-off-by: Eduardo J. Ortega U <[email protected]> * Fixes Signed-off-by: Eduardo J. Ortega U <[email protected]> * Fix a comment Signed-off-by: Eduardo J. Ortega U <[email protected]> * Fix e2e flag test Signed-off-by: Eduardo J. Ortega U <[email protected]> * Update JS code for protobuf changes. Signed-off-by: Eduardo J. Ortega U <[email protected]> * Fix QueryEngine unit test Signed-off-by: Eduardo J. Ortega U <[email protected]> * Fix e2e flag test Signed-off-by: Eduardo J. Ortega U <[email protected]> * Fix spurious tab in comment Signed-off-by: Eduardo J. Ortega U <[email protected]> * Address PR comment Don't use dual format flag for new flags - stick with - separated ones. Signed-off-by: Eduardo J. Ortega U <[email protected]> --------- Signed-off-by: Eduardo J. Ortega U <[email protected]>
slackhq · Apr 16, 2024 · 7046003 · 7046003
1 parent 02af9bc
commit 7046003
Show file tree

Hide file tree

Showing 16 changed files with 1,038 additions and 802 deletions.
diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt
@@ -93,6 +93,7 @@ Usage of vttablet:
       --enable-consolidator                                              Synonym to -enable_consolidator (default true)
       --enable-consolidator-replicas                                     Synonym to -enable_consolidator_replicas
       --enable-lag-throttler                                             Synonym to -enable_lag_throttler
+      --enable-per-workload-table-metrics                                If true, query counts and query error metrics include a label that identifies the workload
       --enable-tx-throttler                                              Synonym to -enable_tx_throttler
       --enable_consolidator                                              This option enables the query consolidator. (default true)
       --enable_consolidator_replicas                                     This option enables the query consolidator only on replicas.

diff --git a/go/vt/proto/query/query.pb.go b/go/vt/proto/query/query.pb.go
diff --git a/go/vt/proto/query/query_vtproto.pb.go b/go/vt/proto/query/query_vtproto.pb.go
diff --git a/go/vt/sqlparser/comments.go b/go/vt/sqlparser/comments.go
@@ -44,6 +44,8 @@ const (
 	DirectiveQueryPlanner = "PLANNER"
 	// DirectiveVtexplainRunDMLQueries tells explain format = vtexplain that it is okay to also run the query.
 	DirectiveVtexplainRunDMLQueries = "EXECUTE_DML_QUERIES"
+	// DirectiveWorkloadName specifies the name of the client application workload issuing the query.
+	DirectiveWorkloadName = "WORKLOAD_NAME"
 )
 
 func isNonSpace(r rune) bool {
@@ -378,3 +380,19 @@ func AllowScatterDirective(stmt Statement) bool {
 	}
 	return comments != nil && comments.Directives().IsSet(DirectiveAllowScatter)
 }
+
+// GetWorkloadNameFromStatement gets the workload name from the provided Statement, using workloadLabel as the name of
+// the query directive that specifies it.
+func GetWorkloadNameFromStatement(statement Statement) string {
+	commentedStatement, ok := statement.(Commented)
+	// This would mean that the statement lacks comments, so we can't obtain the workload from it. Hence default to
+	// empty workload name
+	if !ok {
+		return ""
+	}
+
+	directives := commentedStatement.GetParsedComments().Directives()
+	workloadName, _ := directives.GetString(DirectiveWorkloadName, "")
+
+	return workloadName
+}
diff --git a/go/vt/vtgate/engine/fake_vcursor_test.go b/go/vt/vtgate/engine/fake_vcursor_test.go
@@ -248,6 +248,10 @@ func (t *noopVCursor) SetWorkload(querypb.ExecuteOptions_Workload) {
 	panic("implement me")
 }
 
+func (t *noopVCursor) SetWorkloadName(string) {
+	panic("implement me")
+}
+
 func (t *noopVCursor) SetPlannerVersion(querypb.ExecuteOptions_PlannerVersion) {
 	panic("implement me")
 }
@@ -669,6 +673,10 @@ func (f *loggingVCursor) SetWorkload(querypb.ExecuteOptions_Workload) {
 	panic("implement me")
 }
 
+func (f *loggingVCursor) SetWorkloadName(string) {
+	panic("implement me")
+}
+
 func (f *loggingVCursor) SetPlannerVersion(querypb.ExecuteOptions_PlannerVersion) {
 	panic("implement me")
 }

diff --git a/go/vt/vtgate/engine/primitive.go b/go/vt/vtgate/engine/primitive.go
@@ -144,6 +144,7 @@ type (
 		SetTransactionMode(vtgatepb.TransactionMode)
 		SetWorkload(querypb.ExecuteOptions_Workload)
 		SetPlannerVersion(querypb.ExecuteOptions_PlannerVersion)
+		SetWorkloadName(string)
 		SetFoundRows(uint64)
 
 		SetDDLStrategy(string)

diff --git a/go/vt/vtgate/executor.go b/go/vt/vtgate/executor.go
@@ -985,6 +985,7 @@ func (e *Executor) getPlan(ctx context.Context, vcursor *vcursorImpl, sql string
 	}
 	ignoreMaxMemoryRows := sqlparser.IgnoreMaxMaxMemoryRowsDirective(stmt)
 	vcursor.SetIgnoreMaxMemoryRows(ignoreMaxMemoryRows)
+	vcursor.SetWorkloadName(sqlparser.GetWorkloadNameFromStatement(stmt))
 
 	setVarComment, err := prepareSetVarComment(vcursor, stmt)
 	if err != nil {

diff --git a/go/vt/vtgate/vcursor_impl.go b/go/vt/vtgate/vcursor_impl.go
@@ -786,6 +786,12 @@ func (vc *vcursorImpl) SetPlannerVersion(v plancontext.PlannerVersion) {
 	vc.safeSession.GetOrCreateOptions().PlannerVersion = v
 }
 
+func (vc *vcursorImpl) SetWorkloadName(workloadName string) {
+	if workloadName != "" {
+		vc.safeSession.GetOrCreateOptions().WorkloadName = workloadName
+	}
+}
+
 // SetFoundRows implements the SessionActions interface
 func (vc *vcursorImpl) SetFoundRows(foundRows uint64) {
 	vc.safeSession.FoundRows = foundRows

diff --git a/go/vt/vttablet/tabletserver/query_engine.go b/go/vt/vttablet/tabletserver/query_engine.go
@@ -173,6 +173,9 @@ type QueryEngine struct {
 	// stats
 	queryCounts, queryTimes, queryRowCounts, queryErrorCounts, queryRowsAffected, queryRowsReturned *stats.CountersWithMultiLabels
 
+	// stats flags
+	enablePerWorkloadTableMetrics bool
+
 	// Loggers
 	accessCheckerLogger *logutil.ThrottledLogger
 }
@@ -189,11 +192,12 @@ func NewQueryEngine(env tabletenv.Env, se *schema.Engine) *QueryEngine {
 	}
 
 	qe := &QueryEngine{
-		env:              env,
-		se:               se,
-		tables:           make(map[string]*schema.Table),
-		plans:            cache.NewDefaultCacheImpl(cacheCfg),
-		queryRuleSources: rules.NewMap(),
+		env:                           env,
+		se:                            se,
+		tables:                        make(map[string]*schema.Table),
+		plans:                         cache.NewDefaultCacheImpl(cacheCfg),
+		queryRuleSources:              rules.NewMap(),
+		enablePerWorkloadTableMetrics: config.EnablePerWorkloadTableMetrics,
 	}
 
 	qe.conns = connpool.NewPool(env, "ConnPool", config.OltpReadPool)
@@ -246,12 +250,19 @@ func NewQueryEngine(env tabletenv.Env, se *schema.Engine) *QueryEngine {
 	env.Exporter().NewGaugeFunc("QueryCacheSize", "Query engine query cache size", qe.plans.UsedCapacity)
 	env.Exporter().NewGaugeFunc("QueryCacheCapacity", "Query engine query cache capacity", qe.plans.MaxCapacity)
 	env.Exporter().NewCounterFunc("QueryCacheEvictions", "Query engine query cache evictions", qe.plans.Evictions)
-	qe.queryCounts = env.Exporter().NewCountersWithMultiLabels("QueryCounts", "query counts", []string{"Table", "Plan"})
-	qe.queryTimes = env.Exporter().NewCountersWithMultiLabels("QueryTimesNs", "query times in ns", []string{"Table", "Plan"})
-	qe.queryRowCounts = env.Exporter().NewCountersWithMultiLabels("QueryRowCounts", "(DEPRECATED - use QueryRowsAffected and QueryRowsReturned instead) query row counts", []string{"Table", "Plan"})
-	qe.queryRowsAffected = env.Exporter().NewCountersWithMultiLabels("QueryRowsAffected", "query rows affected", []string{"Table", "Plan"})
-	qe.queryRowsReturned = env.Exporter().NewCountersWithMultiLabels("QueryRowsReturned", "query rows returned", []string{"Table", "Plan"})
-	qe.queryErrorCounts = env.Exporter().NewCountersWithMultiLabels("QueryErrorCounts", "query error counts", []string{"Table", "Plan"})
+
+	labels := []string{"Table", "Plan"}
+	if config.EnablePerWorkloadTableMetrics {
+		labels = []string{"Table", "Plan", "Workload"}
+	}
+
+	qe.queryCounts = env.Exporter().NewCountersWithMultiLabels("QueryCounts", "query counts", labels)
+	qe.queryTimes = env.Exporter().NewCountersWithMultiLabels("QueryTimesNs", "query times in ns", labels)
+	qe.queryRowCounts = env.Exporter().NewCountersWithMultiLabels("QueryRowCounts", "(DEPRECATED - use QueryRowsAffected and QueryRowsReturned instead) query row counts", labels)
+
+	qe.queryRowsAffected = env.Exporter().NewCountersWithMultiLabels("QueryRowsAffected", "query rows affected", labels)
+	qe.queryRowsReturned = env.Exporter().NewCountersWithMultiLabels("QueryRowsReturned", "query rows returned", labels)
+	qe.queryErrorCounts = env.Exporter().NewCountersWithMultiLabels("QueryErrorCounts", "query error counts", labels)
 
 	env.Exporter().HandleFunc("/debug/hotrows", qe.txSerializer.ServeHTTP)
 	env.Exporter().HandleFunc("/debug/tablet_plans", qe.handleHTTPQueryPlans)
@@ -479,9 +490,13 @@ func (qe *QueryEngine) QueryPlanCacheLen() int {
 }
 
 // AddStats adds the given stats for the planName.tableName
-func (qe *QueryEngine) AddStats(planType planbuilder.PlanType, tableName string, queryCount int64, duration, mysqlTime time.Duration, rowsAffected, rowsReturned, errorCount int64) {
+func (qe *QueryEngine) AddStats(planType planbuilder.PlanType, tableName, workload string, queryCount int64, duration, mysqlTime time.Duration, rowsAffected, rowsReturned, errorCount int64) {
 	// table names can contain "." characters, replace them!
 	keys := []string{tableName, planType.String()}
+	// Only use the workload as a label if that's enabled in the configuration.
+	if qe.enablePerWorkloadTableMetrics {
+		keys = append(keys, workload)
+	}
 	qe.queryCounts.Add(keys, queryCount)
 	qe.queryTimes.Add(keys, int64(duration))
 	qe.queryRowCounts.Add(keys, rowsAffected)