Skip to content

Commit 1ff9507

Browse files
milescbMiles Cochran-Branson
andauthored
Add metrics collector for running perf_analyzer with GPU metrics (#52)
* add metrics collector * move serve.py to own file * fix metadata to avoid ambiguity * update default values.yaml * add ingress to metrics-collector * add documentation --------- Co-authored-by: Miles Cochran-Branson <[email protected]>
1 parent 7bfd634 commit 1ff9507

File tree

8 files changed

+302
-2
lines changed

8 files changed

+302
-2
lines changed

docs/configuration-guide.rst

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,43 @@ Additional optional parameters can control how quickly the autoscaler reacts to
344344
periodSeconds: 30
345345
stepsize: 1
346346
347-
11. (optional) Configure advanced monitoring
347+
11. (optional) Configure Metrics Collector for running `perf_analyzer`
348+
=======================================================================
349+
350+
To collect Prometheus metrics when using `perf_analyzer` for testing, a Metrics Collector can be deployed to format Prometheus metrics properly. The Metrics Collector is installed as a subchart with most of the default values pre-configured. To enable the Metrics Collector, set the `metricsCollector.enabled` parameter to `true` in your values file and configure ingress settings if needed as shown below:
351+
352+
.. code-block:: yaml
353+
354+
metricsCollector:
355+
enabled: true
356+
357+
ingress:
358+
enabled: true
359+
hostName: metrics-collector-atlas.nrp-nautilus.io
360+
hosts:
361+
- metrics-collector-atlas.nrp-nautilus.io
362+
tls:
363+
- hosts:
364+
- metrics-collector-atlas.nrp-nautilus.io
365+
ingressClassName: haproxy
366+
annotations:
367+
haproxy-ingress.github.io/cors-enable: "true"
368+
haproxy-ingress.github.io/backend-protocol: "http"
369+
haproxy-ingress.github.io/proxy-body-size: "512m"
370+
haproxy-ingress.github.io/ssl-redirect: "true"
371+
haproxy-ingress.github.io/secure-backends: "false"
372+
373+
Running with `perf_analyzer` is then done with:
374+
375+
.. code-block:: bash
376+
377+
perf_analyzer -m <model_name> -u <envoy_ingress> -i grpc \
378+
--collect-metrics --metrics-url <metrics_collector_endpoint>/metrics \
379+
--verbose-csv -f <out_csv_file_name>.csv
380+
381+
If ingress is not desired, port-forward the metrics collector service (port 8003 by default) and pass `--metrics-url localhost:8003/metrics` to `perf_analyzer` instead.
382+
383+
12. (optional) Configure advanced monitoring
348384
=============================================
349385

350386
Refer to the `advanced monitoring guide <advanced-monitoring>`_.

helm/supersonic/cfg/serve.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import requests
2+
import flask
3+
import os
4+
from flask import Flask, Response
5+
6+
app = Flask(__name__)
7+
8+
PROMETHEUS_URL = os.environ.get('PROMETHEUS_URL')
9+
if not PROMETHEUS_URL:
10+
raise ValueError("PROMETHEUS_URL environment variable must be set")
11+
12+
@app.route("/metrics")
13+
def metrics():
14+
metrics_output = []
15+
gpu_uuids = []
16+
17+
try:
18+
response = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": "nv_gpu_utilization"})
19+
if response.status_code == 200:
20+
data = response.json()
21+
if data["data"]["result"]:
22+
gpu_uuids = [result["metric"].get("gpu_uuid", "GPU-unknown") for result in data["data"]["result"]]
23+
except Exception as e:
24+
print("Error getting GPU UUIDs: " + str(e))
25+
gpu_uuids = ["GPU-unknown"]
26+
27+
metric_queries = {
28+
"nv_pinned_memory_pool_used_bytes": {
29+
"help": "# HELP nv_pinned_memory_pool_used_bytes Pinned memory pool used in bytes",
30+
"type": "# TYPE nv_pinned_memory_pool_used_bytes gauge"
31+
},
32+
"nv_gpu_utilization": {
33+
"help": "# HELP nv_gpu_utilization GPU utilization rate [0.0 - 1.0)",
34+
"type": "# TYPE nv_gpu_utilization gauge"
35+
},
36+
"nv_gpu_memory_total_bytes": {
37+
"help": "# HELP nv_gpu_memory_total_bytes GPU total memory, in bytes",
38+
"type": "# TYPE nv_gpu_memory_total_bytes gauge"
39+
},
40+
"nv_gpu_memory_used_bytes": {
41+
"help": "# HELP nv_gpu_memory_used_bytes GPU used memory, in bytes",
42+
"type": "# TYPE nv_gpu_memory_used_bytes gauge"
43+
},
44+
"nv_gpu_power_usage": {
45+
"help": "# HELP nv_gpu_power_usage GPU power usage in watts",
46+
"type": "# TYPE nv_gpu_power_usage gauge"
47+
},
48+
"nv_gpu_power_limit": {
49+
"help": "# HELP nv_gpu_power_limit GPU power management limit in watts",
50+
"type": "# TYPE nv_gpu_power_limit gauge"
51+
}
52+
}
53+
54+
for metric_name, metric_info in metric_queries.items():
55+
metrics_output.extend([metric_info["help"], metric_info["type"]])
56+
try:
57+
response = requests.get(PROMETHEUS_URL + "/api/v1/query", params={"query": metric_name})
58+
if response.status_code == 200:
59+
data = response.json()
60+
if data["data"]["result"]:
61+
for result in data["data"]["result"]:
62+
value = result["value"][1]
63+
gpu_uuid = result["metric"].get("gpu_uuid", "GPU-unknown")
64+
metrics_output.append(metric_name + "{gpu_uuid=\"" + gpu_uuid + "\"} " + str(value))
65+
else:
66+
for gpu_uuid in gpu_uuids:
67+
metrics_output.append(metric_name + "{gpu_uuid=\"" + gpu_uuid + "\"} 0")
68+
except Exception as e:
69+
print("Error querying " + metric_name + ": " + str(e))
70+
for gpu_uuid in gpu_uuids:
71+
metrics_output.append(metric_name + "{gpu_uuid=\"" + gpu_uuid + "\"} 0")
72+
73+
return Response("\n".join(metrics_output), mimetype="text/plain")
74+
75+
@app.route("/health")
def health():
    """Health-check endpoint; always answers with the body ``Healthy``."""
    return "Healthy"
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{{- if .Values.metricsCollector.enabled }}
# ConfigMap holding the files the metrics collector container runs from.
# Rendered only when metricsCollector.enabled is true.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "supersonic.name" . }}-metrics-collector-config
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/name: {{ .Chart.Name }}
    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
    app.kubernetes.io/component: metrics-collector
data:
  # Python packages required by serve.py.
  requirements.txt: |
    flask
    requests
    gunicorn
  # Chart file cfg/serve.py embedded verbatim as the application module.
  serve.py: |-
    {{- .Files.Get "cfg/serve.py" | nindent 4 }}
{{- end }}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{{- if .Values.metricsCollector.enabled }}
# Deployment running the metrics collector (gunicorn serving serve:app).
# The application files come from the metrics-collector ConfigMap mounted
# at /app; dependencies are installed with pip when the container starts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "supersonic.name" . }}-metrics-collector
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/name: {{ .Chart.Name }}
    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
    app.kubernetes.io/component: metrics-collector
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ .Chart.Name }}
      app.kubernetes.io/instance: {{ include "supersonic.name" . }}
      app.kubernetes.io/component: metrics-collector
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ .Chart.Name }}
        app.kubernetes.io/instance: {{ include "supersonic.name" . }}
        app.kubernetes.io/component: metrics-collector
    spec:
      containers:
      - name: metrics-collector
        image: python:3.11-slim
        command: ["/bin/sh", "-c"]
        args:
          - |
            cd /app
            pip install --no-cache-dir -r /app/requirements.txt
            PYTHONPATH=/app PROMETHEUS_URL={{ include "supersonic.prometheusUrl" . | quote }} \
            gunicorn --bind {{ .Values.metricsCollector.host | default "0.0.0.0" }}:{{ .Values.metricsCollector.port | default 8003 }} \
              --workers 4 \
              --timeout 120 \
              --access-logfile - \
              --error-logfile - \
              "serve:app"
        ports:
          - containerPort: {{ .Values.metricsCollector.port | default 8003 }}
        # pip install runs before gunicorn starts, so keep the pod out of the
        # Service endpoints until /health actually answers.
        readinessProbe:
          httpGet:
            path: /health
            port: {{ .Values.metricsCollector.port | default 8003 }}
          initialDelaySeconds: 10
          periodSeconds: 10
        resources:
          {{- toYaml .Values.metricsCollector.resources | nindent 10 }}
        volumeMounts:
          - name: config-volume
            mountPath: /app
      volumes:
        - name: config-volume
          configMap:
            name: {{ include "supersonic.name" . }}-metrics-collector-config
{{- end }}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
{{- /*
Ingress for the metrics collector.

Both sections are gated on metricsCollector.enabled as well as
metricsCollector.ingress.enabled: the backend Service only exists when the
collector itself is enabled, so an Ingress without it would have no target.
*/ -}}
{{- if and .Values.metricsCollector.enabled .Values.metricsCollector.ingress.enabled .Values.metricsCollector.ingress.hostName -}}
{{- /* Fail the render if another release's Ingress in this namespace already claims the host. */ -}}
{{- $hostName := .Values.metricsCollector.ingress.hostName -}}
{{- $namespace := .Release.Namespace -}}
{{- $currentName := (include "supersonic.name" .) -}}
{{- $existingIngresses := (lookup "networking.k8s.io/v1" "Ingress" $namespace "").items -}}
{{- range $ingress := $existingIngresses -}}
  {{- /* Ingresses named "<release name>-..." belong to this release and are skipped. */ -}}
  {{- if not (hasPrefix (printf "%s-" $currentName) $ingress.metadata.name) -}}
    {{- range $ingress.spec.rules -}}
      {{- if eq .host $hostName -}}
        {{- fail (printf "Error: Ingress host %q is already in use by ingress %q in namespace %q" $hostName $ingress.metadata.name $namespace) -}}
      {{- end -}}
    {{- end -}}
  {{- end -}}
{{- end -}}
{{- end }}

{{- if and .Values.metricsCollector.enabled .Values.metricsCollector.ingress.enabled }}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "supersonic.name" . }}-metrics-collector-ingress
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/name: {{ .Chart.Name }}
    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
    app.kubernetes.io/component: metrics-collector
  {{- with .Values.metricsCollector.ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  ingressClassName: {{ .Values.metricsCollector.ingress.ingressClassName }}
  tls:
    - hosts:
        - {{ .Values.metricsCollector.ingress.hostName }}
  rules:
    - host: {{ .Values.metricsCollector.ingress.hostName }}
      http:
        paths:
          - path: /
            pathType: ImplementationSpecific
            backend:
              service:
                name: {{ include "supersonic.name" . }}-metrics-collector
                port:
                  number: {{ .Values.metricsCollector.service.port | default 8003 }}
{{- end }}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{{- if .Values.metricsCollector.enabled }}
# Service in front of the metrics collector pods.
# The port values are read from the chart values rather than hard-coded so
# they stay in step with the Deployment (metricsCollector.port) and the
# Ingress backend (metricsCollector.service.port).
apiVersion: v1
kind: Service
metadata:
  name: {{ include "supersonic.name" . }}-metrics-collector
  namespace: {{ .Release.Namespace }}
  labels:
    app.kubernetes.io/name: {{ .Chart.Name }}
    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
    app.kubernetes.io/component: metrics-collector
spec:
  selector:
    app.kubernetes.io/name: {{ .Chart.Name }}
    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
    app.kubernetes.io/component: metrics-collector
  ports:
    # port: what clients and the Ingress connect to;
    # targetPort: the container port gunicorn binds to.
    - port: {{ .Values.metricsCollector.service.port | default 8003 }}
      targetPort: {{ .Values.metricsCollector.port | default 8003 }}
  type: {{ .Values.metricsCollector.service.type | default "ClusterIP" }}
{{- end }}

helm/supersonic/values.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,3 +490,34 @@ opentelemetry-collector:
490490
receivers: [spanmetrics]
491491
processors: [batch]
492492
exporters: [prometheusremotewrite]
493+
494+
metricsCollector:
495+
# -- Enable metrics collector
496+
enabled: false
497+
498+
# -- Bind address and port on which the metrics collector (gunicorn) listens inside the pod
499+
host: "0.0.0.0"
500+
port: 8003
501+
502+
# -- Resource limits and requests for metrics collector
503+
resources:
504+
limits:
505+
cpu: 1
506+
memory: 1Gi
507+
requests:
508+
cpu: 500m
509+
memory: 512Mi
510+
511+
# -- Service settings for the metrics collector (port and type)
512+
service:
513+
port: 8003
514+
type: ClusterIP
515+
516+
517+
# -- Ingress configuration for metrics collector
518+
ingress:
519+
enabled: false
520+
hostName: ""
521+
ingressClassName: ""
522+
annotations: {}
523+

values/values-nautilus-atlas.yaml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ triton:
3030
resources:
3131
limits: { nvidia.com/gpu: 1, cpu: 2, memory: 16G }
3232
requests: { nvidia.com/gpu: 1, cpu: 2, memory: 16G }
33-
replicas: 4
33+
replicas: 1
3434
modelRepository:
3535
enabled: true
3636
storageType: cvmfs-pvc
@@ -96,3 +96,22 @@ grafana:
9696
grafana.ini:
9797
server:
9898
root_url: https://grafana-atlas.nrp-nautilus.io
99+
100+
metricsCollector:
101+
enabled: true
102+
103+
ingress:
104+
enabled: true
105+
hostName: metrics-collector-atlas.nrp-nautilus.io
106+
hosts:
107+
- metrics-collector-atlas.nrp-nautilus.io
108+
tls:
109+
- hosts:
110+
- metrics-collector-atlas.nrp-nautilus.io
111+
ingressClassName: haproxy
112+
annotations:
113+
haproxy-ingress.github.io/cors-enable: "true"
114+
haproxy-ingress.github.io/backend-protocol: "http"
115+
haproxy-ingress.github.io/proxy-body-size: "512m"
116+
haproxy-ingress.github.io/ssl-redirect: "true"
117+
haproxy-ingress.github.io/secure-backends: "false"

0 commit comments

Comments
 (0)