Skip to content

Commit 7135f75

Browse files
committed
get and publish metrics
Signed-off-by: Alex Castilio dos Santos <[email protected]>
1 parent 8a62ddb commit 7135f75

File tree

6 files changed

+230
-30
lines changed

6 files changed

+230
-30
lines changed

go.mod

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@ require (
1515
sigs.k8s.io/cloud-provider-azure/pkg/azclient/configloader v0.0.20
1616
)
1717

18-
retract (
19-
v0.10.0 // published accidentally
20-
)
18+
retract v0.10.0 // published accidentally
2119

2220
require (
2321
cel.dev/expr v0.15.0 // indirect
@@ -333,6 +331,7 @@ require (
333331
k8s.io/apiextensions-apiserver v0.30.3
334332
k8s.io/cli-runtime v0.30.3
335333
k8s.io/kubectl v0.30.3
334+
k8s.io/metrics v0.30.3
336335
sigs.k8s.io/controller-runtime v0.18.5
337336
)
338337

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,8 @@ k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7F
12001200
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
12011201
k8s.io/kubectl v0.30.3 h1:YIBBvMdTW0xcDpmrOBzcpUVsn+zOgjMYIu7kAq+yqiI=
12021202
k8s.io/kubectl v0.30.3/go.mod h1:IcR0I9RN2+zzTRUa1BzZCm4oM0NLOawE6RzlDvd1Fpo=
1203+
k8s.io/metrics v0.30.3 h1:gKCpte5zykrOmQhZ8qmsxyJslMdiLN+sqbBfIWNpbGM=
1204+
k8s.io/metrics v0.30.3/go.mod h1:W06L2nXRhOwPkFYDJYWdEIS3u6JcJy3ebIPYbndRs6A=
12031205
k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ=
12041206
k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
12051207
oras.land/oras-go v1.2.5 h1:XpYuAwAb0DfQsunIyMfeET92emK8km3W4yEzZvUbsTo=
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
package scaletest
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"log"
7+
"sync"
8+
"time"
9+
10+
"github.com/microsoft/retina/pkg/telemetry"
11+
"github.com/microsoft/retina/test/e2e/common"
12+
"github.com/pkg/errors"
13+
"k8s.io/apimachinery/pkg/api/resource"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/labels"
16+
"k8s.io/client-go/kubernetes"
17+
"k8s.io/client-go/tools/clientcmd"
18+
metrics "k8s.io/metrics/pkg/client/clientset/versioned"
19+
)
20+
21+
// GetAndPublishMetrics is a scale-test step that periodically collects pod and
// node resource usage and publishes it as telemetry events.
//
// Exported fields (configuration):
//   - KubeConfigFilePath: kubeconfig path passed to clientcmd.BuildConfigFromFlags
//     when the Kubernetes and metrics clients are built.
//   - AppInsightsKey: key passed to telemetry.InitAppInsights; Prevalidate
//     rejects an empty value.
//   - AdditionalTelemetryProperty: properties handed to the telemetry client;
//     Prevalidate requires the "retinaVersion" key to be present.
//   - Labels: label set turned into the selector used to list the pods whose
//     metrics are collected.
//
// Unexported fields (runtime state populated by Run):
//   - stop: closed by Stop to end the background goroutine.
//   - wg: lets Stop wait for the background goroutine to exit.
//   - telemetryClient: client used to publish the collected events.
type GetAndPublishMetrics struct {
22+
KubeConfigFilePath string
23+
AppInsightsKey string
24+
AdditionalTelemetryProperty map[string]string
25+
Labels map[string]string
26+
stop chan struct{}
27+
wg sync.WaitGroup
28+
telemetryClient *telemetry.TelemetryClient
29+
}
30+
31+
func (g *GetAndPublishMetrics) Run() error {
32+
telemetry.InitAppInsights(g.AppInsightsKey, g.AdditionalTelemetryProperty["retinaVersion"])
33+
34+
telemetryClient, err := telemetry.NewAppInsightsTelemetryClient("retina-scale-test", g.AdditionalTelemetryProperty)
35+
if err != nil {
36+
return errors.Wrap(err, "error creating telemetry client")
37+
}
38+
39+
g.telemetryClient = telemetryClient
40+
41+
g.stop = make(chan struct{})
42+
g.wg.Add(1)
43+
44+
go func() {
45+
46+
t := time.NewTicker(5 * time.Minute)
47+
48+
for {
49+
select {
50+
51+
case <-t.C:
52+
err = g.getAndPublishMetrics()
53+
if err != nil {
54+
log.Fatalf("error getting and publishing number of restarts: %v", err)
55+
return
56+
}
57+
58+
case <-g.stop:
59+
g.wg.Done()
60+
return
61+
62+
}
63+
}
64+
65+
}()
66+
67+
return nil
68+
}
69+
70+
func (g *GetAndPublishMetrics) Stop() error {
71+
telemetry.ShutdownAppInsights()
72+
close(g.stop)
73+
g.wg.Wait()
74+
return nil
75+
}
76+
77+
func (g *GetAndPublishMetrics) Prevalidate() error {
78+
if g.AppInsightsKey == "" {
79+
return fmt.Errorf("AppInsightsKey is required")
80+
}
81+
if _, ok := g.AdditionalTelemetryProperty["retinaVersion"]; !ok {
82+
return fmt.Errorf("retinaVersion is required in AdditionalTelemetryProperty")
83+
}
84+
return nil
85+
}
86+
87+
func (g *GetAndPublishMetrics) getAndPublishMetrics() error {
88+
// Get the number of restarts
89+
config, err := clientcmd.BuildConfigFromFlags("", g.KubeConfigFilePath)
90+
if err != nil {
91+
return fmt.Errorf("error building kubeconfig: %w", err)
92+
}
93+
94+
clientset, err := kubernetes.NewForConfig(config)
95+
if err != nil {
96+
return fmt.Errorf("error creating Kubernetes client: %w", err)
97+
}
98+
99+
mc, err := metrics.NewForConfig(config)
100+
if err != nil {
101+
return fmt.Errorf("error creating metrics client: %w", err)
102+
}
103+
104+
ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
105+
defer cancel()
106+
107+
metrics, err := g.getMetrics(ctx, clientset, mc)
108+
if err != nil {
109+
return fmt.Errorf("error getting metrics: %w", err)
110+
}
111+
112+
// Publish the number of restarts
113+
for _, metric := range metrics {
114+
g.telemetryClient.TrackEvent("scale-test", metric)
115+
}
116+
117+
return nil
118+
}
119+
120+
// metric is one flat record of string key/value pairs describing a single
// pod; each record is published as the properties of one telemetry event.
type metric map[string]string
121+
122+
func (g *GetAndPublishMetrics) getMetrics(ctx context.Context, k8sClient *kubernetes.Clientset, metricsClient *metrics.Clientset) ([]metric, error) {
123+
124+
labelSelector := labels.Set(g.Labels).String()
125+
126+
pods, err := k8sClient.CoreV1().Pods(common.KubeSystemNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
127+
if err != nil {
128+
return nil, errors.Wrap(err, "error getting nodes")
129+
}
130+
131+
nodesMetricsInt := metricsClient.MetricsV1beta1().NodeMetricses()
132+
podMetricsInt := metricsClient.MetricsV1beta1().PodMetricses(common.KubeSystemNamespace)
133+
134+
var allPodsHealth []metric
135+
136+
timestamp := time.Now().UTC().Format(time.RFC3339)
137+
138+
for _, pod := range pods.Items {
139+
var podHealth metric = make(map[string]string)
140+
141+
podMetrics, err := podMetricsInt.Get(ctx, pod.Name, metav1.GetOptions{})
142+
if err != nil {
143+
return nil, errors.Wrap(err, "error getting pod metrics")
144+
}
145+
146+
podMem := resource.MustParse("0")
147+
podCpu := resource.MustParse("0")
148+
for _, cm := range podMetrics.Containers {
149+
podMem.Add(cm.Usage["memory"])
150+
podCpu.Add(cm.Usage["cpu"])
151+
}
152+
153+
nodeMetrics, err := nodesMetricsInt.Get(ctx, pod.Spec.NodeName, metav1.GetOptions{})
154+
if err != nil {
155+
return nil, errors.Wrap(err, "error getting node metrics")
156+
}
157+
158+
nodeMem := nodeMetrics.Usage["memory"]
159+
nodeCpu := nodeMetrics.Usage["cpu"]
160+
161+
restarts := 0
162+
163+
for _, containerStatus := range pod.Status.ContainerStatuses {
164+
restarts = restarts + int(containerStatus.RestartCount)
165+
}
166+
167+
podHealth["timestamp"] = timestamp
168+
podHealth["pod"] = pod.Name
169+
podHealth["podCpuInMilliCore"] = fmt.Sprintf("%d", podCpu.MilliValue())
170+
podHealth["podMemoryInMB"] = fmt.Sprintf("%d", podMem.Value()/(1048576))
171+
podHealth["podRestarts"] = fmt.Sprintf("%d", restarts)
172+
podHealth["node"] = pod.Spec.NodeName
173+
podHealth["nodeCpuInMilliCore"] = fmt.Sprintf("%d", nodeCpu.MilliValue())
174+
podHealth["nodeMemoryInMB"] = fmt.Sprintf("%d", nodeMem.Value()/(1048576))
175+
176+
allPodsHealth = append(allPodsHealth, podHealth)
177+
178+
}
179+
180+
return allPodsHealth, nil
181+
}

test/e2e/framework/scaletest/options.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,6 @@ type Options struct {
3535
DeleteNetworkPoliciesTimes int
3636
numKwokPods int
3737
numRealPods int
38+
LabelsToGetMetrics map[string]string
39+
AdditionalTelemetryProperty map[string]string
3840
}

test/e2e/jobs/scale.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package retina
22

33
import (
4+
"os"
45
"time"
56

67
"github.com/microsoft/retina/test/e2e/framework/kubernetes"
@@ -37,6 +38,8 @@ func DefaultScaleTestOptions() scaletest.Options {
3738
DeleteNetworkPolicies: false,
3839
DeleteNetworkPoliciesInterval: 60 * time.Second,
3940
DeleteNetworkPoliciesTimes: 1,
41+
LabelsToGetMetrics: map[string]string{},
42+
AdditionalTelemetryProperty: map[string]string{},
4043
}
4144
}
4245

@@ -60,6 +63,12 @@ func ScaleTest(opt *scaletest.Options) *types.Job {
6063

6164
job.AddStep(&kubernetes.CreateNamespace{}, nil)
6265

66+
job.AddStep(&scaletest.GetAndPublishMetrics{
67+
Labels: opt.LabelsToGetMetrics,
68+
AppInsightsKey: os.Getenv("APP_INSIGHTS_KEY"),
69+
AdditionalTelemetryProperty: opt.AdditionalTelemetryProperty,
70+
}, &types.StepOptions{RunInBackgroundWithID: "get-metrics"})
71+
6372
job.AddStep(&scaletest.CreateResources{
6473
NumKwokDeployments: opt.NumKwokDeployments,
6574
NumKwokReplicas: opt.NumKwokReplicas,
@@ -95,15 +104,11 @@ func ScaleTest(opt *scaletest.Options) *types.Job {
95104
NumSharedLabelsPerPod: opt.NumSharedLabelsPerPod,
96105
}, nil)
97106

98-
// job.AddStep(&kubernetes.DeleteNamespace{}, nil)
99-
100-
// TODO: Add steps to get the state of the cluster
101-
102-
// job.AddStep(&kubernetes.GetDeployment{})
103-
104-
// job.AddStep(&kubernetes.GetDaemonSet{})
107+
job.AddStep(&types.Stop{
108+
BackgroundID: "get-metrics",
109+
}, nil)
105110

106-
// job.AddStep(&kubernetes.DescribePods{})
111+
job.AddStep(&kubernetes.DeleteNamespace{}, nil)
107112

108113
return job
109114
}

test/e2e/scale_test.go

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"time"
1515

1616
"github.com/microsoft/retina/test/e2e/common"
17+
"github.com/microsoft/retina/test/e2e/framework/generic"
1718
"github.com/microsoft/retina/test/e2e/framework/helpers"
1819
"github.com/microsoft/retina/test/e2e/framework/types"
1920
jobs "github.com/microsoft/retina/test/e2e/jobs"
@@ -69,25 +70,7 @@ func TestE2ERetina_Scale(t *testing.T) {
6970
chartPath := filepath.Join(rootDir, "deploy", "legacy", "manifests", "controller", "helm", "retina")
7071
kubeConfigFilePath := filepath.Join(rootDir, "test", "e2e", "test.pem")
7172

72-
// CreateTestInfra
73-
createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath, *createInfra))
74-
createTestInfra.Run(ctx)
75-
76-
t.Cleanup(func() {
77-
if *deleteInfra {
78-
_ = jobs.DeleteTestInfra(subID, rg, clusterName, location).Run()
79-
}
80-
})
81-
82-
// Install Retina
83-
installRetina := types.NewRunner(t, jobs.InstallRetina(kubeConfigFilePath, chartPath))
84-
installRetina.Run(ctx)
85-
86-
t.Cleanup(func() {
87-
_ = jobs.UninstallRetina(kubeConfigFilePath, chartPath).Run()
88-
})
89-
90-
// Scale test
73+
// Scale test parameters
9174
opt := jobs.DefaultScaleTestOptions()
9275
opt.KubeconfigPath = kubeConfigFilePath
9376

@@ -114,6 +97,34 @@ func TestE2ERetina_Scale(t *testing.T) {
11497
require.NoError(t, err)
11598
}
11699

100+
RetinaVersion := os.Getenv(generic.DefaultTagEnv)
101+
require.NotEmpty(t, RetinaVersion)
102+
opt.AdditionalTelemetryProperty["retinaVersion"] = RetinaVersion
103+
opt.AdditionalTelemetryProperty["clusterName"] = clusterName
104+
105+
// AppInsightsKey is required for telemetry
106+
require.NotEmpty(t, os.Getenv("APP_INSIGHTS_KEY"))
107+
108+
opt.LabelsToGetMetrics = map[string]string{"k8s-app": "retina"}
109+
110+
// CreateTestInfra
111+
createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, rg, clusterName, location, kubeConfigFilePath, *createInfra))
112+
createTestInfra.Run(ctx)
113+
114+
t.Cleanup(func() {
115+
if *deleteInfra {
116+
_ = jobs.DeleteTestInfra(subID, rg, clusterName, location).Run()
117+
}
118+
})
119+
120+
// Install Retina
121+
installRetina := types.NewRunner(t, jobs.InstallRetina(kubeConfigFilePath, chartPath))
122+
installRetina.Run(ctx)
123+
124+
t.Cleanup(func() {
125+
_ = jobs.UninstallRetina(kubeConfigFilePath, chartPath).Run()
126+
})
127+
117128
scale := types.NewRunner(t, jobs.ScaleTest(&opt))
118129
scale.Run(ctx)
119130
}

0 commit comments

Comments
 (0)