Skip to content

Commit 81a52fc

Browse files
authored
Merge pull request #29 from NVIDIA/vgpu-manager-17.2-fixes
vGPU Manager fixes required for vGPU 17.2
2 parents 617b57d + 31fb669 commit 81a52fc

File tree

3 files changed

+21
-3
lines changed

3 files changed

+21
-3
lines changed

vgpu-manager/rhel8/nvidia-driver

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ set -xe
55

66
DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
77
DRIVER_RESET_RETRIES=10
8+
DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:-15}
89
RUN_DIR=/run/nvidia
910

1011
# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -45,7 +46,7 @@ _install_driver() {
4546

4647
# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
4748
_load_driver() {
48-
/usr/bin/nvidia-vgpud &
49+
/usr/bin/nvidia-vgpud
4950
/usr/bin/nvidia-vgpu-mgr &
5051

5152
# check nvidia drivers are loaded
@@ -59,6 +60,11 @@ _load_driver() {
5960
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
6061
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
6162
_enable_vfs() {
63+
# Wait before attempting to create VFs to ensure the driver has finished initializing.
64+
# This is a workaround (WAR) for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
65+
# exit code even though VF creation fails.
66+
sleep $DELAY_BEFORE_VF_CREATION
67+
6268
local retry
6369
for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do
6470
if /usr/lib/nvidia/sriov-manage -e ALL; then

vgpu-manager/ubuntu20.04/nvidia-driver

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ set -xeu
55
DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
66
DRIVER_ARCH=${DRIVER_ARCH:?"Missing driver arch"}
77
DRIVER_RESET_RETRIES=10
8+
DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:-15}
89
KERNEL_VERSION=$(uname -r)
910
RUN_DIR=/run/nvidia
1011

@@ -119,7 +120,7 @@ _install_driver() {
119120

120121
# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
121122
_load_driver() {
122-
/usr/bin/nvidia-vgpud &
123+
/usr/bin/nvidia-vgpud
123124
/usr/bin/nvidia-vgpu-mgr &
124125

125126
# check nvidia drivers are loaded
@@ -133,6 +134,11 @@ _load_driver() {
133134
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
134135
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
135136
_enable_vfs() {
137+
# Wait before attempting to create VFs to ensure the driver has finished initializing.
138+
# This is a workaround (WAR) for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
139+
# exit code even though VF creation fails.
140+
sleep $DELAY_BEFORE_VF_CREATION
141+
136142
local retry
137143
for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do
138144
if /usr/lib/nvidia/sriov-manage -e ALL; then

vgpu-manager/ubuntu22.04/nvidia-driver

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ set -xeu
55
DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
66
DRIVER_ARCH=${DRIVER_ARCH:?"Missing driver arch"}
77
DRIVER_RESET_RETRIES=10
8+
DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:-15}
89
KERNEL_VERSION=$(uname -r)
910
RUN_DIR=/run/nvidia
1011

@@ -119,7 +120,7 @@ _create_dev_char_directory() {
119120

120121
# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
121122
_load_driver() {
122-
/usr/bin/nvidia-vgpud &
123+
/usr/bin/nvidia-vgpud
123124
/usr/bin/nvidia-vgpu-mgr &
124125

125126
# check nvidia drivers are loaded
@@ -133,6 +134,11 @@ _load_driver() {
133134
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
134135
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
135136
_enable_vfs() {
137+
# Wait before attempting to create VFs to ensure the driver has finished initializing.
138+
# This is a workaround (WAR) for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
139+
# exit code even though VF creation fails.
140+
sleep $DELAY_BEFORE_VF_CREATION
141+
136142
local retry
137143
for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do
138144
if /usr/lib/nvidia/sriov-manage -e ALL; then

0 commit comments

Comments
 (0)