1+ import requests
2+ import flask
3+ import os
4+ from flask import Flask , Response
5+
# Flask application object; the route handlers in this module register on it.
app = Flask(__name__)

# Base URL of the Prometheus server that the handlers query. It is read once,
# at import time, from the environment. An unset or empty value makes the
# import fail with ValueError rather than failing later on the first request.
PROMETHEUS_URL = os.getenv("PROMETHEUS_URL")
if PROMETHEUS_URL is None or len(PROMETHEUS_URL) == 0:
    raise ValueError("PROMETHEUS_URL environment variable must be set")
11+
# Gauges re-exported by /metrics, in output order: (metric name, HELP text).
_GPU_METRICS = (
    ("nv_pinned_memory_pool_used_bytes", "Pinned memory pool used in bytes"),
    ("nv_gpu_utilization", "GPU utilization rate [0.0 - 1.0)"),
    ("nv_gpu_memory_total_bytes", "GPU total memory, in bytes"),
    ("nv_gpu_memory_used_bytes", "GPU used memory, in bytes"),
    ("nv_gpu_power_usage", "GPU power usage in watts"),
    ("nv_gpu_power_limit", "GPU power management limit in watts"),
)

# Metric whose series are used to discover which GPU UUIDs exist.
_UUID_SOURCE_METRIC = "nv_gpu_utilization"

# Label value used when a series has no gpu_uuid label or discovery failed.
_UNKNOWN_GPU = "GPU-unknown"

# Upper bound, in seconds, for each Prometheus request. Without a timeout
# `requests` waits indefinitely and a stalled upstream would hang the handler.
_QUERY_TIMEOUT_SECONDS = 5


def _escape_label_value(raw):
    """Escape backslash, double quote and newline for an exposition label value."""
    return str(raw).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


def _format_sample(metric_name, gpu_uuid, value):
    """Render one exposition sample line: ``name{gpu_uuid="<uuid>"} <value>``."""
    return f'{metric_name}{{gpu_uuid="{_escape_label_value(gpu_uuid)}"}} {value}'


def _fetch_samples(metric_name):
    """Run an instant query for *metric_name* and return its samples.

    Returns:
        A list of ``(gpu_uuid, value)`` pairs, one per series in the response
        (possibly empty), or ``None`` when the request fails, the status is
        not 200, or the payload does not have the expected structure.
    """
    # Tolerate a configured base URL that ends with "/" so the path never
    # contains a double slash.
    url = PROMETHEUS_URL.rstrip("/") + "/api/v1/query"
    try:
        response = requests.get(
            url,
            params={"query": metric_name},
            timeout=_QUERY_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            app.logger.warning(
                "Error querying %s: HTTP %s", metric_name, response.status_code
            )
            return None
        # Parse every series before returning, so a malformed entry rejects
        # the whole response instead of leaving a partial sample list.
        return [
            (series["metric"].get("gpu_uuid", _UNKNOWN_GPU), series["value"][1])
            for series in response.json()["data"]["result"]
        ]
    except (
        requests.RequestException,  # connection errors, timeouts
        ValueError,  # body is not valid JSON
        LookupError,  # missing key / short "value" array
        TypeError,
        AttributeError,  # payload has an unexpected shape
    ) as exc:
        app.logger.warning("Error querying %s: %s", metric_name, exc)
        return None


@app.route("/metrics")
def metrics():
    """Re-export selected GPU gauges from Prometheus in text exposition format.

    Each metric in ``_GPU_METRICS`` is queried once. For every metric the
    output contains its ``# HELP`` and ``# TYPE`` lines followed by one sample
    per returned series, labelled with that series' ``gpu_uuid``.

    When a metric's query fails or returns no series, a ``0`` sample is
    emitted for every known GPU UUID instead. The known UUIDs come from the
    ``nv_gpu_utilization`` result; if that query itself fails, the single
    placeholder UUID ``GPU-unknown`` is used.

    Returns:
        A ``text/plain`` ``Response`` whose body ends with a line feed.
    """
    samples_by_metric = {name: _fetch_samples(name) for name, _ in _GPU_METRICS}

    discovered = samples_by_metric[_UUID_SOURCE_METRIC]
    if discovered is None:
        gpu_uuids = [_UNKNOWN_GPU]
    else:
        gpu_uuids = [gpu_uuid for gpu_uuid, _ in discovered]

    lines = []
    for name, help_text in _GPU_METRICS:
        lines.append(f"# HELP {name} {help_text}")
        lines.append(f"# TYPE {name} gauge")
        # None (failed query) and [] (no series) both fall back to zeros.
        samples = samples_by_metric[name] or [(gpu_uuid, 0) for gpu_uuid in gpu_uuids]
        lines.extend(
            _format_sample(name, gpu_uuid, value) for gpu_uuid, value in samples
        )

    # The exposition format requires the last line to end with a line feed.
    return Response("\n".join(lines) + "\n", mimetype="text/plain")
74+
@app.route("/health")
def health():
    """Health-check endpoint: always returns the fixed text ``Healthy``."""
    status_text = "Healthy"
    return status_text
0 commit comments