4
4
from pydantic import BaseModel
5
5
import logging
6
6
import pynvml
7
+ import atexit
7
8
8
9
logger = logging .getLogger (__name__ )
9
10
10
11
11
- class GpuBaseInfo (BaseModel ):
12
+ class GPUBaseInfo (BaseModel ):
12
13
"""Model for general GPU information."""
13
14
14
15
id : str
@@ -17,88 +18,134 @@ class GpuBaseInfo(BaseModel):
17
18
memory_free : int
18
19
19
20
20
- class GpuComputeInfo ( GpuBaseInfo ):
21
class GPUComputeInfo(GPUBaseInfo):
    """General GPU information extended with the CUDA compute capability."""

    # The two components of the device's CUDA compute capability version,
    # in the order NVML's nvmlDeviceGetCudaComputeCapability() returns them.
    major: int
    minor: int
25
26
26
27
27
- class GpuUtilizationInfo ( GpuBaseInfo ):
28
- """Model for real-time GPU utilization statistics."""
28
class GPUUtilizationInfo(GPUBaseInfo):
    """General GPU information extended with utilization readings."""

    # Taken from the `gpu` and `memory` fields of NVML's utilization rates.
    # NOTE(review): presumably percentages in the range 0-100 -- confirm
    # against the NVML documentation.
    utilization_compute: int
    utilization_memory: int
32
33
33
34
34
- class GpuInfo ( GpuComputeInfo , GpuUtilizationInfo ):
35
- """Model for full CUDA device information."""
35
class GPUInfo(GPUComputeInfo, GPUUtilizationInfo):
    """Complete GPU record: general, compute-capability and utilization fields.

    Adds no fields of its own; it only combines the two parent models.
    """
38
39
39
40
40
- def retrieve_cuda_info () -> Dict [int , GpuInfo ]:
41
- """Retrieve CUDA device information.
42
-
43
- Returns:
44
- CUDA device information.
45
- """
46
- devices = {}
47
- for i in range (pynvml .nvmlDeviceGetCount ()):
48
- handle = pynvml .nvmlDeviceGetHandleByIndex (i )
49
- uuid = pynvml .nvmlDeviceGetUUID (handle )
50
- name = pynvml .nvmlDeviceGetName (handle )
51
- memory_info = pynvml .nvmlDeviceGetMemoryInfo (handle )
52
- major , minor = pynvml .nvmlDeviceGetCudaComputeCapability (handle )
53
- utilization_rates = pynvml .nvmlDeviceGetUtilizationRates (handle )
54
- devices [i ] = GpuInfo (
55
- id = uuid ,
56
- name = name ,
57
- memory_total = memory_info .total ,
58
- memory_free = memory_info .free ,
59
- major = major ,
60
- minor = minor ,
61
- utilization_compute = utilization_rates .gpu ,
62
- utilization_memory = utilization_rates .memory ,
63
- )
64
- return devices
65
-
66
-
67
- def get_gpu_info () -> Dict [int , GpuComputeInfo ]:
68
- """Get detailed GPU compute information.
69
-
70
- Returns:
71
- The detailed GPU compute information.
72
- """
73
- basic_info = retrieve_cuda_info ()
74
- return {
75
- i : GpuComputeInfo (
76
- id = info .id ,
77
- name = info .name ,
78
- memory_total = info .memory_total ,
79
- memory_free = info .memory_free ,
80
- major = info .major ,
81
- minor = info .minor ,
82
- )
83
- for i , info in basic_info .items ()
84
- }
85
-
86
-
87
- def get_gpu_stats () -> Dict [int , GpuUtilizationInfo ]:
88
- """Get real-time GPU utilization statistics.
89
-
90
- Returns:
91
- The real-time GPU utilization statistics.
92
- """
93
- basic_info = retrieve_cuda_info ()
94
- return {
95
- i : GpuUtilizationInfo (
96
- id = info .id ,
97
- name = info .name ,
98
- memory_total = info .memory_total ,
99
- memory_free = info .memory_free ,
100
- utilization_compute = info .utilization_compute ,
101
- utilization_memory = info .utilization_memory ,
102
- )
103
- for i , info in basic_info .items ()
104
- }
41
class HardwareInfo:
    """Retrieve hardware information about the host machine through NVML.

    NVML is initialized when an instance is constructed. If
    ``pynvml.nvmlInit()`` raises ``pynvml.NVMLError`` the failure is logged
    and the instance stays usable: every query method then returns an empty
    mapping instead of raising.
    """

    def __init__(self) -> None:
        """Initialize NVML and, on success, schedule its shutdown at exit."""
        self._initialized = False
        self._initialize_nvml()
        if self._initialized:
            # Register the hook only when there is something to tear down.
            # atexit keeps a strong reference to the bound method, so an
            # unconditional registration would pin instances that never
            # managed to initialize NVML for the lifetime of the process.
            atexit.register(self._shutdown_nvml)

    def _initialize_nvml(self) -> None:
        """Initialize NVML once; log instead of raising when it fails."""
        if self._initialized:
            return
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError as e:
            logger.error("Failed to initialize NVML: %s", e)
        else:
            self._initialized = True
            logger.info("NVML initialized successfully.")

    def _shutdown_nvml(self) -> None:
        """Shut NVML down if this instance initialized it.

        A failed shutdown is logged and leaves the instance marked as
        initialized.
        """
        if not self._initialized:
            return
        try:
            pynvml.nvmlShutdown()
        except pynvml.NVMLError as e:
            logger.error("Failed to shutdown NVML: %s", e)
        else:
            self._initialized = False
            logger.info("NVML shutdown successfully.")

    def _read_device(self, index: int) -> GPUInfo:
        """Query NVML for the device at ``index`` and build its record.

        Args:
            index: NVML device index, in ``range(nvmlDeviceGetCount())``.

        Returns:
            The populated ``GPUInfo`` for that device.

        Raises:
            pynvml.NVMLError: If any of the underlying NVML queries fails.
        """
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        uuid = pynvml.nvmlDeviceGetUUID(handle)
        name = pynvml.nvmlDeviceGetName(handle)
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
        utilization_rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return GPUInfo(
            id=uuid,
            name=name,
            memory_total=memory_info.total,
            memory_free=memory_info.free,
            major=major,
            minor=minor,
            utilization_compute=utilization_rates.gpu,
            utilization_memory=utilization_rates.memory,
        )

    def get_cuda_info(self) -> Dict[int, GPUInfo]:
        """Retrieve CUDA device information.

        Returns:
            A dictionary mapping NVML device indices to their information.
            It is empty when NVML is not initialized or the device count
            cannot be read. Devices whose queries fail are logged and left
            out of the result.
        """
        devices: Dict[int, GPUInfo] = {}
        if not self._initialized:
            logger.warning(
                "NVML is not initialized. Cannot retrieve CUDA device information."
            )
            return devices

        try:
            device_count = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError as e:
            logger.warning("Failed to retrieve CUDA device information: %s", e)
            return devices

        for index in range(device_count):
            # Handle failures per device: one GPU that rejects a query must
            # not hide the devices that come after it in the enumeration.
            try:
                devices[index] = self._read_device(index)
            except pynvml.NVMLError as e:
                logger.warning(
                    "Failed to retrieve information for CUDA device %d: %s",
                    index,
                    e,
                )
        return devices

    def get_gpu_compute_info(self) -> Dict[int, GPUComputeInfo]:
        """Get detailed GPU compute information.

        Returns:
            A dictionary mapping NVML device indices to their compute
            information (general fields plus compute capability).
        """
        return {
            index: GPUComputeInfo(
                id=info.id,
                name=info.name,
                memory_total=info.memory_total,
                memory_free=info.memory_free,
                major=info.major,
                minor=info.minor,
            )
            for index, info in self.get_cuda_info().items()
        }

    def log_gpu_compute_info(self) -> None:
        """Log the compute information of every device at INFO level."""
        devices = self.get_gpu_compute_info()
        if not devices:
            logger.info("No CUDA devices available.")
            return
        logger.info("CUDA devices available:")
        for device_id, device_info in devices.items():
            logger.info("Device %s: %s", device_id, device_info)

    def get_gpu_utilization_stats(self) -> Dict[int, GPUUtilizationInfo]:
        """Get GPU utilization statistics.

        Returns:
            A dictionary mapping NVML device indices to their utilization
            statistics (general fields plus utilization readings).
        """
        return {
            index: GPUUtilizationInfo(
                id=info.id,
                name=info.name,
                memory_total=info.memory_total,
                memory_free=info.memory_free,
                utilization_compute=info.utilization_compute,
                utilization_memory=info.utilization_memory,
            )
            for index, info in self.get_cuda_info().items()
        }
0 commit comments