
Commit 0f5b66f

fix: prevents nvml runtime error (#292)
This commit prevents an NVML runtime error from being thrown when the NVML library is not found. It also simplifies the hardware info logic.
1 parent 695c72f commit 0f5b66f
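
The fix amounts to catching pynvml's NVMLError during initialization instead of letting it propagate when the NVML shared library (i.e. the NVIDIA driver) is missing. As a rough, standalone sketch of that guard pattern (the helper name try_init_nvml is hypothetical; in the commit the equivalent logic lives in HardwareInfo._initialize_nvml below):

import logging

import pynvml

logger = logging.getLogger(__name__)


def try_init_nvml() -> bool:
    """Return True if NVML could be initialized, False otherwise."""
    try:
        pynvml.nvmlInit()
        return True
    except pynvml.NVMLError as e:
        # Without the NVML library (e.g. no NVIDIA driver installed),
        # nvmlInit() raises NVMLError; log it instead of crashing the runner.
        logger.error(f"Failed to initialize NVML: {e}")
        return False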

File tree

9 files changed: 300 additions & 225 deletions


runner/app/main.py

Lines changed: 4 additions & 14 deletions
@@ -2,12 +2,10 @@
 import os
 from contextlib import asynccontextmanager
 
-import app
 from app.routes import health, hardware
 from fastapi import FastAPI
 from fastapi.routing import APIRoute
-from app.utils.hardware import get_gpu_info
-from app.utils.nvml_manager import nvml_manager
+from app.utils.hardware import HardwareInfo
 
 logger = logging.getLogger(__name__)
 
@@ -16,7 +14,8 @@
 async def lifespan(app: FastAPI):
     config_logging()
 
-    nvml_manager.initialize()
+    # Create application wide hardware info service.
+    app.hardware_info_service = HardwareInfo()
 
     app.include_router(health.router)
     app.include_router(hardware.router)
@@ -27,13 +26,11 @@ async def lifespan(app: FastAPI):
     app.pipeline = load_pipeline(pipeline, model_id)
     app.include_router(load_route(pipeline))
 
-    print_cuda_devices()
+    app.hardware_info_service.log_gpu_compute_info()
     logger.info(f"Started up with pipeline {app.pipeline}")
 
     yield
 
-    nvml_manager.shutdown()
-
     logger.info("Shutting down")
 
 
@@ -143,13 +140,6 @@ def config_logging():
     )
 
 
-def print_cuda_devices():
-    devices = get_gpu_info()
-    logger.info("Cuda devices available:")
-    for device in devices:
-        logger.info(devices[device])
-
-
 def use_route_names_as_operation_ids(app: FastAPI) -> None:
     for route in app.routes:
         if isinstance(route, APIRoute):
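
In short, the lifespan handler now creates one HardwareInfo service and hangs it off the FastAPI app instead of driving a module-level nvml_manager. A simplified sketch of that startup wiring, with pipeline loading, routers and environment handling omitted:

from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.utils.hardware import HardwareInfo


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Create the application-wide hardware info service once at startup.
    # HardwareInfo catches NVML initialization errors internally, so this
    # does not raise on machines without the NVML library.
    app.hardware_info_service = HardwareInfo()
    app.hardware_info_service.log_gpu_compute_info()
    yield
    # NVML shutdown is handled by HardwareInfo via atexit, not here.


app = FastAPI(lifespan=lifespan)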

runner/app/routes/hardware.py

Lines changed: 9 additions & 11 deletions
@@ -2,12 +2,10 @@
 from typing import Dict
 
 from app.utils.hardware import (
-    GpuComputeInfo,
-    GpuUtilizationInfo,
-    get_gpu_info,
-    get_gpu_stats,
+    GPUComputeInfo,
+    GPUUtilizationInfo
 )
-from fastapi import APIRouter
+from fastapi import APIRouter, Request
 from pydantic import BaseModel
 
 router = APIRouter()
@@ -18,15 +16,15 @@ class HardwareInformation(BaseModel):
 
     pipeline: str
     model_id: str
-    gpu_info: Dict[int, GpuComputeInfo]
+    gpu_info: Dict[int, GPUComputeInfo]
 
 
 class HardwareStats(BaseModel):
     """Response model for real-time GPU statistics."""
 
     pipeline: str
     model_id: str
-    gpu_stats: Dict[int, GpuUtilizationInfo]
+    gpu_stats: Dict[int, GPUUtilizationInfo]
 
 
 @router.get(
@@ -39,11 +37,11 @@ class HardwareStats(BaseModel):
     response_model=HardwareInformation,
     include_in_schema=False,
 )
-async def hardware_info():
+async def hardware_info(request: Request):
     return HardwareInformation(
         pipeline=os.environ["PIPELINE"],
         model_id=os.environ["MODEL_ID"],
-        gpu_info=get_gpu_info(),
+        gpu_info=request.app.hardware_info_service.get_gpu_compute_info(),
     )
 
 
@@ -57,9 +55,9 @@ async def hardware_info():
     response_model=HardwareStats,
     include_in_schema=False,
 )
-async def hardware_stats():
+async def hardware_stats(request: Request):
    return HardwareStats(
         pipeline=os.environ["PIPELINE"],
         model_id=os.environ["MODEL_ID"],
-        gpu_stats=get_gpu_stats(),
+        gpu_stats=request.app.hardware_info_service.get_gpu_utilization_stats(),
     )
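
With Request injected, the handlers stop importing module-level helpers and instead read the shared service off the application instance. A stripped-down illustration of the access pattern (the /gpu-info route is hypothetical, not part of the commit):

from fastapi import APIRouter, Request

router = APIRouter()


@router.get("/gpu-info")
async def gpu_info(request: Request):
    # request.app is the running FastAPI instance, so this returns the single
    # HardwareInfo object attached during lifespan startup.
    service = request.app.hardware_info_service
    return service.get_gpu_compute_info()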

runner/app/utils/hardware.py

Lines changed: 118 additions & 71 deletions
@@ -4,11 +4,12 @@
 from pydantic import BaseModel
 import logging
 import pynvml
+import atexit
 
 logger = logging.getLogger(__name__)
 
 
-class GpuBaseInfo(BaseModel):
+class GPUBaseInfo(BaseModel):
     """Model for general GPU information."""
 
     id: str
@@ -17,88 +18,134 @@ class GpuBaseInfo(BaseModel):
     memory_free: int
 
 
-class GpuComputeInfo(GpuBaseInfo):
+class GPUComputeInfo(GPUBaseInfo):
     """Model for detailed GPU compute information."""
 
     major: int
     minor: int
 
 
-class GpuUtilizationInfo(GpuBaseInfo):
-    """Model for real-time GPU utilization statistics."""
+class GPUUtilizationInfo(GPUBaseInfo):
+    """Model for GPU utilization statistics."""
 
     utilization_compute: int
     utilization_memory: int
 
 
-class GpuInfo(GpuComputeInfo, GpuUtilizationInfo):
-    """Model for full CUDA device information."""
+class GPUInfo(GPUComputeInfo, GPUUtilizationInfo):
+    """Model for full GPU device information."""
 
     pass
 
 
-def retrieve_cuda_info() -> Dict[int, GpuInfo]:
-    """Retrieve CUDA device information.
-
-    Returns:
-        CUDA device information.
-    """
-    devices = {}
-    for i in range(pynvml.nvmlDeviceGetCount()):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        uuid = pynvml.nvmlDeviceGetUUID(handle)
-        name = pynvml.nvmlDeviceGetName(handle)
-        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-        major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
-        utilization_rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
-        devices[i] = GpuInfo(
-            id=uuid,
-            name=name,
-            memory_total=memory_info.total,
-            memory_free=memory_info.free,
-            major=major,
-            minor=minor,
-            utilization_compute=utilization_rates.gpu,
-            utilization_memory=utilization_rates.memory,
-        )
-    return devices
-
-
-def get_gpu_info() -> Dict[int, GpuComputeInfo]:
-    """Get detailed GPU compute information.
-
-    Returns:
-        The detailed GPU compute information.
-    """
-    basic_info = retrieve_cuda_info()
-    return {
-        i: GpuComputeInfo(
-            id=info.id,
-            name=info.name,
-            memory_total=info.memory_total,
-            memory_free=info.memory_free,
-            major=info.major,
-            minor=info.minor,
-        )
-        for i, info in basic_info.items()
-    }
-
-
-def get_gpu_stats() -> Dict[int, GpuUtilizationInfo]:
-    """Get real-time GPU utilization statistics.
-
-    Returns:
-        The real-time GPU utilization statistics.
-    """
-    basic_info = retrieve_cuda_info()
-    return {
-        i: GpuUtilizationInfo(
-            id=info.id,
-            name=info.name,
-            memory_total=info.memory_total,
-            memory_free=info.memory_free,
-            utilization_compute=info.utilization_compute,
-            utilization_memory=info.utilization_memory,
-        )
-        for i, info in basic_info.items()
-    }
+class HardwareInfo:
+    """Class used to retrieve hardware information about the host machine."""
+
+    def __init__(self):
+        """Initialize the HardwareInfo class and hardware info retrieval services."""
+        self._initialized = False
+        self._initialize_nvml()
+        atexit.register(self._shutdown_nvml)
+
+    def _initialize_nvml(self) -> None:
+        """Initialize NVML."""
+        if not self._initialized:
+            try:
+                pynvml.nvmlInit()
+                self._initialized = True
+                logger.info("NVML initialized successfully.")
+            except pynvml.NVMLError as e:
+                logger.error(f"Failed to initialize NVML: {e}")
+
+    def _shutdown_nvml(self) -> None:
+        """Shutdown NVML."""
+        if self._initialized:
+            try:
+                pynvml.nvmlShutdown()
+                self._initialized = False
+                logger.info("NVML shutdown successfully.")
+            except pynvml.NVMLError as e:
+                logger.error(f"Failed to shutdown NVML: {e}")
+
+    def get_cuda_info(self) -> Dict[int, GPUInfo]:
+        """Retrieve CUDA device information.
+
+        Returns:
+            A dictionary mapping GPU device IDs to their information.
+        """
+        devices = {}
+        if not self._initialized:
+            logger.warning(
+                "NVML is not initialized. Cannot retrieve CUDA device information."
+            )
+            return devices
+
+        try:
+            for i in range(pynvml.nvmlDeviceGetCount()):
+                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                uuid = pynvml.nvmlDeviceGetUUID(handle)
+                name = pynvml.nvmlDeviceGetName(handle)
+                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+                utilization_rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
+                devices[i] = GPUInfo(
+                    id=uuid,
+                    name=name,
+                    memory_total=memory_info.total,
+                    memory_free=memory_info.free,
+                    major=major,
+                    minor=minor,
+                    utilization_compute=utilization_rates.gpu,
+                    utilization_memory=utilization_rates.memory,
+                )
+        except pynvml.NVMLError as e:
+            logger.warning(f"Failed to retrieve CUDA device information: {e}")
+        return devices
+
+    def get_gpu_compute_info(self) -> Dict[int, GPUComputeInfo]:
+        """Get detailed GPU compute information.
+
+        Returns:
+            A dictionary mapping GPU device IDs to their compute information.
+        """
+        basic_info = self.get_cuda_info()
+        return {
+            i: GPUComputeInfo(
+                id=info.id,
+                name=info.name,
+                memory_total=info.memory_total,
+                memory_free=info.memory_free,
+                major=info.major,
+                minor=info.minor,
+            )
+            for i, info in basic_info.items()
+        }
+
+    def log_gpu_compute_info(self):
+        """Log detailed GPU compute information."""
+        devices = self.get_gpu_compute_info()
+        if devices:
+            logger.info("CUDA devices available:")
+            for device_id, device_info in devices.items():
+                logger.info(f"Device {device_id}: {device_info}")
+        else:
+            logger.info("No CUDA devices available.")
+
+    def get_gpu_utilization_stats(self) -> Dict[int, GPUUtilizationInfo]:
+        """Get GPU utilization statistics.
+
+        Returns:
+            A dictionary mapping GPU device IDs to their utilization statistics.
+        """
+        basic_info = self.get_cuda_info()
+        return {
+            i: GPUUtilizationInfo(
+                id=info.id,
+                name=info.name,
+                memory_total=info.memory_total,
+                memory_free=info.memory_free,
+                utilization_compute=info.utilization_compute,
+                utilization_memory=info.utilization_memory,
+            )
+            for i, info in basic_info.items()
+        }
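
The key behavioural change from the old module-level functions is that every accessor now degrades to an empty dict when NVML cannot be initialized, rather than raising. A quick usage sketch under that assumption:

from app.utils.hardware import HardwareInfo

info = HardwareInfo()  # logs an error, but does not raise, if NVML is absent
compute = info.get_gpu_compute_info()
if not compute:
    # Empty dict: no NVIDIA GPUs, or NVML could not be initialized.
    print("No CUDA devices available.")
else:
    for device_id, device in compute.items():
        print(device_id, device.name, device.memory_free)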

runner/app/utils/nvml_manager.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

0 commit comments
