Skip to content

Upgrade TMA metrics for GNR #388

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jun 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cmd/metrics/event_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,21 @@ func LoadEventGroups(eventDefinitionOverridePath string, metadata Metadata) (gro
defer file.Close()
scanner := bufio.NewScanner(file)
uncollectable := mapset.NewSet[string]()
if flagTransactionRate == 0 {
uncollectable.Add("TXN")
}
var group GroupDefinition
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if len(line) == 0 || line[0] == '#' {
continue
}
// strip end of line comment
if idx := strings.Index(line, "#"); idx != -1 {
line = line[:idx]
}
// remove trailing spaces
line = strings.TrimSpace(line)
var event EventDefinition
if event, err = parseEventDefinition(line[:len(line)-1]); err != nil {
return
Expand Down
84 changes: 79 additions & 5 deletions cmd/metrics/metric_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"log/slog"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/Knetic/govaluate"
Expand Down Expand Up @@ -92,22 +93,25 @@ func ConfigureMetrics(loadedMetrics []MetricDefinition, uncollectableEvents []st
err = fmt.Errorf("unknown granularity: %s", flagGranularity)
return
}

coresPerSocket := fmt.Sprintf("%f", float64(metadata.CoresPerSocket))
chasPerSocket := fmt.Sprintf("%f", float64(len(metadata.UncoreDeviceIDs["cha"])))
socketCount := fmt.Sprintf("%f", float64(metadata.SocketCount))
hyperThreadingOn := fmt.Sprintf("%t", metadata.ThreadsPerCore > 1)
threadsPerCore := fmt.Sprintf("%f", float64(metadata.ThreadsPerCore))
// load retire latency constants
var retireLatencies map[string]string
if retireLatencies, err = LoadRetireLatencies(metadata); err != nil {
slog.Error("failed to load retire latencies", slog.String("error", err.Error()))
return
}
// configure each metric
reConstantInt := regexp.MustCompile(`\[(\d+)\]`)
for metricIdx := range loadedMetrics {
tmpMetric := loadedMetrics[metricIdx]
// abbreviate event names in metric expressions to match abbreviations used in uncollectableEvents
tmpMetric.Expression = abbreviateEventName(tmpMetric.Expression)
// skip metrics that use uncollectable events
foundUncollectable := false
if flagTransactionRate == 0 {
uncollectableEvents = append(uncollectableEvents, "TXN")
}
for _, uncollectableEvent := range uncollectableEvents {
if strings.Contains(tmpMetric.Expression, uncollectableEvent) {
slog.Debug("removing metric that uses uncollectable event", slog.String("metric", tmpMetric.Name), slog.String("event", uncollectableEvent))
Expand All @@ -123,8 +127,11 @@ func ConfigureMetrics(loadedMetrics []MetricDefinition, uncollectableEvents []st
if transformed, err = transformConditional(tmpMetric.Expression); err != nil {
return
}
// replace "> =" with ">=" and "< =" with "<="
transformed = strings.ReplaceAll(transformed, "> =", ">=")
transformed = strings.ReplaceAll(transformed, "< =", "<=")
if transformed != tmpMetric.Expression {
slog.Debug("transformed metric", slog.String("original", tmpMetric.Name), slog.String("transformed", transformed))
slog.Debug("transformed metric", slog.String("metric name", tmpMetric.Name), slog.String("transformed", transformed))
tmpMetric.Expression = transformed
}
// replace constants with their values
Expand All @@ -136,6 +143,24 @@ func ConfigureMetrics(loadedMetrics []MetricDefinition, uncollectableEvents []st
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[HYPERTHREADING_ON]", hyperThreadingOn)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CONST_THREAD_COUNT]", threadsPerCore)
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate))
// replace retire latencies
for retireEvent, retireLatency := range retireLatencies {
// replace <event>:retire_latency with value
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, fmt.Sprintf("[%s:retire_latency]", retireEvent), retireLatency)
}
// replace constant numbers masquerading as variables with their values, e.g., [20] -> 20
// there may be more than one with differing values in the expression, so use a regex to find them all
for {
// find the first match
found := reConstantInt.FindStringSubmatchIndex(tmpMetric.Expression)
if found == nil {
break // no more matches
}
// match[2] is the start of the number, match[3] is the end of the number
number := tmpMetric.Expression[found[2]:found[3]]
// replace the whole match with the number
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, tmpMetric.Expression[found[0]:found[1]], number)
}
// get a list of the variables in the expression
tmpMetric.Variables = make(map[string]int)
expressionIdx := 0
Expand Down Expand Up @@ -239,3 +264,52 @@ func transformConditional(origIn string) (out string, err error) {
}
return
}

// PlatformInfo is the shape of the "Platform" object inside a retire latency
// JSON document (see RetireLatency). Every field is typed as a string, so each
// value must be encoded as a JSON string to decode successfully.
// NOTE(review): the key names look like lscpu field labels — confirm against
// whatever tool generates the JSON files.
type PlatformInfo struct {
	// CPU identification.
	ModelName string `json:"Model name"`
	CPUFamily string `json:"CPU family"`
	Model     string `json:"Model"`

	// Topology.
	ThreadsPerCore string `json:"Thread(s) per core"`
	CoresPerSocket string `json:"Core(s) per socket"`
	Sockets        string `json:"Socket(s)"`

	// Remaining platform attributes.
	Stepping   string `json:"Stepping"`
	L3Cache    string `json:"L3 cache"`
	NUMANodes  string `json:"NUMA node(s)"`
	TMAVersion string `json:"TMA version"`
}

// MetricStats carries the three summary values stored for a single entry of
// RetireLatency.Data, read from the upper-case JSON keys MIN, MAX and MEAN.
type MetricStats struct {
	Min  float64 `json:"MIN"`  // smallest recorded value
	Max  float64 `json:"MAX"`  // largest recorded value
	Mean float64 `json:"MEAN"` // average value
}

// RetireLatency is the top-level layout of a retire latency JSON document:
// a "Platform" object describing where the numbers were gathered and a "Data"
// object mapping a name to its statistics. LoadRetireLatencies treats the Data
// keys as event names.
type RetireLatency struct {
	Platform PlatformInfo           `json:"Platform"`
	Data     map[string]MetricStats `json:"Data"`
}

// LoadRetireLatencies reads the retire latency resource that matches the
// platform described by metadata and returns each event's mean latency,
// formatted with "%f", keyed by event name.
//
// The resource is looked up at
// resources/metrics/<Architecture>/<Vendor>/<uarch>_retire_latency.json, where
// <uarch> is metadata.Microarchitecture truncated at its first "_",
// lower-cased, then truncated at its first space.
//
// If the resource cannot be read the function returns (nil, nil): a missing
// file is expected, since not every architecture defines retire latencies.
// If the resource is read but is not valid JSON, the error is logged and
// returned together with a nil map.
func LoadRetireLatencies(metadata Metadata) (retireLatencies map[string]string, err error) {
	// derive the short, lower-case microarchitecture name used in the file name
	prefix, _, _ := strings.Cut(metadata.Microarchitecture, "_")
	uarch, _, _ := strings.Cut(strings.ToLower(prefix), " ")
	resourcePath := filepath.Join(
		"resources",
		"metrics",
		metadata.Architecture,
		metadata.Vendor,
		uarch+"_retire_latency.json",
	)

	content, readErr := resources.ReadFile(resourcePath)
	if readErr != nil {
		// not all architectures have retire latencies defined
		return nil, nil
	}

	var parsed RetireLatency
	if err = json.Unmarshal(content, &parsed); err != nil {
		slog.Error("failed to unmarshal retire latencies", slog.String("error", err.Error()))
		return nil, err
	}

	// keep only the mean of every event, rendered as text
	retireLatencies = make(map[string]string, len(parsed.Data))
	for name, entry := range parsed.Data {
		retireLatencies[name] = fmt.Sprintf("%f", entry.Mean)
	}
	slog.Debug("loaded retire latencies", slog.Any("latencies", retireLatencies))
	return retireLatencies, nil
}
Loading