Add metrics collector for running perf_analyzer with GPU metrics (#52)

milescb · Miles Cochran-Branson · web-flow · commit 1ff95071b093 · 2025-03-03T11:43:14.000-05:00
* add metrics collector

* move serve.py to own file

* fix metadata to avoid ambiguity

* update default values.yaml

* add ingress to metrics-collector

* add documentation

---------

Co-authored-by: Miles Cochran-Branson <miles.cb@cern.ch>
diff --git a/docs/configuration-guide.rst b/docs/configuration-guide.rst
@@ -344,7 +344,43 @@ Additional optional parameters can control how quickly the autoscaler reacts to
        periodSeconds: 30
        stepsize: 1
 
-11.  (optional) Configure advanced monitoring 
+11. (optional) Configure Metrics Collector for running `perf_analyzer`
+=======================================================================
+
+To collect Prometheus metrics when using ``perf_analyzer`` for testing, a Metrics Collector can be deployed to serve the metrics in the format ``perf_analyzer`` expects. The Metrics Collector is installed as part of the chart with most of the default values pre-configured. To enable it, set the ``metricsCollector.enabled`` parameter to ``true`` in your values file and, if needed, configure its ingress settings under ``metricsCollector.ingress`` as shown below:
+
+.. code-block:: yaml
+
+    metricsCollector:
+        enabled: true
+
+        ingress:
+            enabled: true
+            hostName: metrics-collector-atlas.nrp-nautilus.io
+            hosts:
+            - metrics-collector-atlas.nrp-nautilus.io
+            tls:
+            - hosts:
+                - metrics-collector-atlas.nrp-nautilus.io
+            ingressClassName: haproxy
+            annotations:
+                haproxy-ingress.github.io/cors-enable: "true"
+                haproxy-ingress.github.io/backend-protocol: "http"
+                haproxy-ingress.github.io/proxy-body-size: "512m"
+                haproxy-ingress.github.io/ssl-redirect: "true"
+                haproxy-ingress.github.io/secure-backends: "false"
+
+``perf_analyzer`` can then be run with:
+
+.. code-block:: bash
+
+    perf_analyzer -m <model_name> -u <envoy_ingress> -i grpc \
+        --collect-metrics --metrics-url <metrics_collector_endpoint>/metrics \
+        --verbose-csv -f <out_csv_file_name>.csv
+
+If ingress is not desired, port-forward the metrics collector service and pass ``--metrics-url localhost:8003/metrics`` to ``perf_analyzer`` instead.
+
+12.  (optional) Configure advanced monitoring 
 =============================================
 
 Refer to the `advanced monitoring guide <advanced-monitoring>`_.
diff --git a/helm/supersonic/cfg/serve.py b/helm/supersonic/cfg/serve.py
@@ -0,0 +1,77 @@
+import requests
+import flask
+import os
+from flask import Flask, Response
+
+app = Flask(__name__)
+
+PROMETHEUS_URL = os.environ.get('PROMETHEUS_URL')
+if not PROMETHEUS_URL:
+    raise ValueError("PROMETHEUS_URL environment variable must be set")
+
+@app.route("/metrics")
+def metrics():
+    metrics_output = []
+    gpu_uuids = []
+    
+    try:
+        response = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": "nv_gpu_utilization"})
+        if response.status_code == 200:
+            data = response.json()
+            if data["data"]["result"]:
+                gpu_uuids = [result["metric"].get("gpu_uuid", "GPU-unknown") for result in data["data"]["result"]]
+    except Exception as e:
+        print("Error getting GPU UUIDs: " + str(e))
+        gpu_uuids = ["GPU-unknown"]
+
+    metric_queries = {
+        "nv_pinned_memory_pool_used_bytes": {
+            "help": "# HELP nv_pinned_memory_pool_used_bytes Pinned memory pool used in bytes",
+            "type": "# TYPE nv_pinned_memory_pool_used_bytes gauge"
+        },
+        "nv_gpu_utilization": {
+            "help": "# HELP nv_gpu_utilization GPU utilization rate [0.0 - 1.0)",
+            "type": "# TYPE nv_gpu_utilization gauge"
+        },
+        "nv_gpu_memory_total_bytes": {
+            "help": "# HELP nv_gpu_memory_total_bytes GPU total memory, in bytes",
+            "type": "# TYPE nv_gpu_memory_total_bytes gauge"
+        },
+        "nv_gpu_memory_used_bytes": {
+            "help": "# HELP nv_gpu_memory_used_bytes GPU used memory, in bytes",
+            "type": "# TYPE nv_gpu_memory_used_bytes gauge"
+        },
+        "nv_gpu_power_usage": {
+            "help": "# HELP nv_gpu_power_usage GPU power usage in watts",
+            "type": "# TYPE nv_gpu_power_usage gauge"
+        },
+        "nv_gpu_power_limit": {
+            "help": "# HELP nv_gpu_power_limit GPU power management limit in watts",
+            "type": "# TYPE nv_gpu_power_limit gauge"
+        }
+    }
+
+    for metric_name, metric_info in metric_queries.items():
+        metrics_output.extend([metric_info["help"], metric_info["type"]])
+        try:
+            response = requests.get(PROMETHEUS_URL + "/api/v1/query", params={"query": metric_name})
+            if response.status_code == 200:
+                data = response.json()
+                if data["data"]["result"]:
+                    for result in data["data"]["result"]:
+                        value = result["value"][1]
+                        gpu_uuid = result["metric"].get("gpu_uuid", "GPU-unknown")
+                        metrics_output.append(metric_name + "{gpu_uuid=\"" + gpu_uuid + "\"} " + str(value))
+                else:
+                    for gpu_uuid in gpu_uuids:
+                        metrics_output.append(metric_name + "{gpu_uuid=\"" + gpu_uuid + "\"} 0")
+        except Exception as e:
+            print("Error querying " + metric_name + ": " + str(e))
+            for gpu_uuid in gpu_uuids:
+                metrics_output.append(metric_name + "{gpu_uuid=\"" + gpu_uuid + "\"} 0")
+
+    return Response("\n".join(metrics_output), mimetype="text/plain")
+
+@app.route("/health")
+def health():
+    return "Healthy"
diff --git a/helm/supersonic/templates/metrics-collector-configmaps.yaml b/helm/supersonic/templates/metrics-collector-configmaps.yaml
@@ -0,0 +1,18 @@
+{{- if .Values.metricsCollector.enabled }}
+# ConfigMap carrying the metrics collector's Python files; rendered only when
+# metricsCollector.enabled is set.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "supersonic.name" . }}-metrics-collector-config
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ .Chart.Name }}
+    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
+    app.kubernetes.io/component: metrics-collector
+data:
+  # Python packages required by serve.py.
+  requirements.txt: |
+    flask
+    requests
+    gunicorn
+  # Contents of cfg/serve.py, embedded verbatim and indented to sit inside
+  # this block scalar.
+  serve.py: |-
+{{ (.Files.Get "cfg/serve.py") | indent 4 }}
+{{- end }}
diff --git a/helm/supersonic/templates/metrics-collector-deployment.yaml b/helm/supersonic/templates/metrics-collector-deployment.yaml
@@ -0,0 +1,51 @@
+{{- if .Values.metricsCollector.enabled }}
+# Single-replica Deployment that runs the metrics collector (serve.py) under
+# gunicorn; rendered only when metricsCollector.enabled is set.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "supersonic.name" . }}-metrics-collector
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ .Chart.Name }}
+    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
+    app.kubernetes.io/component: metrics-collector
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ .Chart.Name }}
+      app.kubernetes.io/instance: {{ include "supersonic.name" . }}
+      app.kubernetes.io/component: metrics-collector
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ .Chart.Name }}
+        app.kubernetes.io/instance: {{ include "supersonic.name" . }}
+        app.kubernetes.io/component: metrics-collector
+    spec:
+      containers:
+      - name: metrics-collector
+        # Stock Python image; the startup script below pip-installs
+        # requirements.txt on every container start, then launches gunicorn
+        # with PROMETHEUS_URL taken from the supersonic.prometheusUrl helper.
+        image: python:3.11-slim
+        command: ["/bin/sh", "-c"]
+        args:
+        - |
+          cd /app
+          pip install --no-cache-dir -r /app/requirements.txt
+          PYTHONPATH=/app PROMETHEUS_URL={{ include "supersonic.prometheusUrl" . | quote }} \
+          gunicorn --bind {{ .Values.metricsCollector.host | default "0.0.0.0" }}:{{ .Values.metricsCollector.port | default 8003 }} \
+            --workers 4 \
+            --timeout 120 \
+            --access-logfile - \
+            --error-logfile - \
+            "serve:app"
+        ports:
+        # Same port gunicorn binds to above.
+        - containerPort: {{ .Values.metricsCollector.port | default 8003 }}
+        resources:
+            {{- toYaml .Values.metricsCollector.resources | nindent 10 }}
+        volumeMounts:
+        # /app holds serve.py and requirements.txt from the ConfigMap below.
+        - name: config-volume
+          mountPath: /app
+      volumes:
+      - name: config-volume
+        configMap:
+          name: {{ include "supersonic.name" . }}-metrics-collector-config
+{{- end }}
diff --git a/helm/supersonic/templates/metrics-collector-ingress.yaml b/helm/supersonic/templates/metrics-collector-ingress.yaml
@@ -0,0 +1,48 @@
+{{- /*
+  Ingress for the metrics collector.
+
+  Both sections are gated on metricsCollector.enabled as well as
+  metricsCollector.ingress.enabled: the backend Service is only rendered when
+  the collector itself is enabled, so an Ingress without it would point at a
+  Service that does not exist.
+
+  The first section aborts rendering when another Ingress in the release
+  namespace (one whose name does not start with "<release name>-") already
+  uses the requested host.
+*/ -}}
+{{- if and .Values.metricsCollector.enabled .Values.metricsCollector.ingress.enabled .Values.metricsCollector.ingress.hostName -}}
+  {{- $hostName := .Values.metricsCollector.ingress.hostName -}}
+  {{- $namespace := .Release.Namespace -}}
+  {{- $currentName := (include "supersonic.name" .) -}}
+  {{- $existingIngresses := (lookup "networking.k8s.io/v1" "Ingress" $namespace "").items -}}
+  {{- range $ingress := $existingIngresses -}}
+    {{- if not (hasPrefix (printf "%s-" $currentName) $ingress.metadata.name) -}}
+      {{- range $ingress.spec.rules -}}
+        {{- if eq .host $hostName -}}
+          {{- fail (printf "Error: Ingress host %q is already in use by ingress %q in namespace %q" $hostName $ingress.metadata.name $namespace) -}}
+        {{- end -}}
+      {{- end -}}
+    {{- end -}}
+  {{- end -}}
+{{- end }}
+
+{{ if and .Values.metricsCollector.enabled .Values.metricsCollector.ingress.enabled }}
+
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "supersonic.name" . }}-metrics-collector-ingress
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ .Chart.Name }}
+    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
+    app.kubernetes.io/component: metrics-collector
+  {{- /* Emit the annotations key only when annotations are configured. */}}
+  {{- with .Values.metricsCollector.ingress.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  ingressClassName: {{ .Values.metricsCollector.ingress.ingressClassName }}
+  tls:
+  - hosts:
+      - {{ .Values.metricsCollector.ingress.hostName }}
+  rules:
+  - host: {{ .Values.metricsCollector.ingress.hostName }}
+    http:
+      paths:
+      - path: /
+        pathType: ImplementationSpecific
+        backend:
+          service:
+            name: {{ include "supersonic.name" . }}-metrics-collector
+            port:
+              number: {{ .Values.metricsCollector.service.port | default 8003 }}
+{{ end }}
diff --git a/helm/supersonic/templates/metrics-collector-service.yaml b/helm/supersonic/templates/metrics-collector-service.yaml
@@ -0,0 +1,20 @@
+{{- if .Values.metricsCollector.enabled }}
+# Service in front of the metrics collector pods; rendered only when
+# metricsCollector.enabled is set.
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "supersonic.name" . }}-metrics-collector
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: {{ .Chart.Name }}
+    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
+    app.kubernetes.io/component: metrics-collector
+spec:
+  selector:
+    app.kubernetes.io/name: {{ .Chart.Name }}
+    app.kubernetes.io/instance: {{ include "supersonic.name" . }}
+    app.kubernetes.io/component: metrics-collector
+  ports:
+  # `port` must match the backend port the Ingress template references
+  # (metricsCollector.service.port) and `targetPort` the port gunicorn binds
+  # to in the Deployment (metricsCollector.port); both default to 8003.
+  - port: {{ .Values.metricsCollector.service.port | default 8003 }}
+    targetPort: {{ .Values.metricsCollector.port | default 8003 }}
+  type: {{ .Values.metricsCollector.service.type | default "ClusterIP" }}
+{{- end }}
diff --git a/helm/supersonic/values.yaml b/helm/supersonic/values.yaml
@@ -490,3 +490,34 @@ opentelemetry-collector:
           receivers: [spanmetrics]
           processors: [batch]
           exporters: [prometheusremotewrite]
+
+metricsCollector:
+  # -- Enable metrics collector
+  enabled: false
+
+  # -- Host and port for metrics collector
+  host: "0.0.0.0"
+  port: 8003
+
+  # -- Resource limits and requests for metrics collector
+  resources:
+    limits:
+      cpu: 1
+      memory: 1Gi
+    requests:
+      cpu: 500m
+      memory: 512Mi
+
+  # -- Add service
+  service:
+    port: 8003
+    type: ClusterIP
+
+
+  # -- Ingress configuration for metrics collector
+  ingress:
+    enabled: false
+    hostName: ""
+    ingressClassName: ""
+    annotations: {}
+
diff --git a/values/values-nautilus-atlas.yaml b/values/values-nautilus-atlas.yaml
@@ -30,7 +30,7 @@ triton:
   resources:
     limits: { nvidia.com/gpu: 1, cpu: 2, memory: 16G }
     requests: { nvidia.com/gpu: 1, cpu: 2, memory: 16G }
-  replicas: 4
+  replicas: 1
   modelRepository:
     enabled: true
     storageType: cvmfs-pvc
@@ -96,3 +96,22 @@ grafana:
   grafana.ini:
     server:
       root_url: https://grafana-atlas.nrp-nautilus.io
+
+metricsCollector:
+  enabled: true
+
+  ingress:
+    enabled: true
+    hostName: metrics-collector-atlas.nrp-nautilus.io
+    hosts:
+      - metrics-collector-atlas.nrp-nautilus.io
+    tls:
+      - hosts:
+          - metrics-collector-atlas.nrp-nautilus.io
+    ingressClassName: haproxy
+    annotations:
+      haproxy-ingress.github.io/cors-enable: "true"
+      haproxy-ingress.github.io/backend-protocol: "http"
+      haproxy-ingress.github.io/proxy-body-size: "512m"
+      haproxy-ingress.github.io/ssl-redirect: "true"
+      haproxy-ingress.github.io/secure-backends: "false"