File tree Expand file tree Collapse file tree 3 files changed +21
-3
lines changed Expand file tree Collapse file tree 3 files changed +21
-3
lines changed Original file line number Diff line number Diff line change 5
5
6
6
DRIVER_VERSION=${DRIVER_VERSION:? " Missing driver version" }
7
7
DRIVER_RESET_RETRIES=10
8
+ DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:- 15}
8
9
RUN_DIR=/run/nvidia
9
10
10
11
# Mount the driver rootfs into the run directory with the exception of sysfs.
@@ -45,7 +46,7 @@ _install_driver() {
45
46
46
47
# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
47
48
_load_driver () {
48
- /usr/bin/nvidia-vgpud &
49
+ /usr/bin/nvidia-vgpud
49
50
/usr/bin/nvidia-vgpu-mgr &
50
51
51
52
# check nvidia drivers are loaded
@@ -59,6 +60,11 @@ _load_driver() {
59
60
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
60
61
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
61
62
_enable_vfs () {
63
+ # Wait before attempting to create VFs to ensure the driver has finished initializing.
64
+ # This is a workaround (WAR) for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
65
+ # exit code even though VF creation fails.
66
+ sleep $DELAY_BEFORE_VF_CREATION
67
+
62
68
local retry
63
69
for (( retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++ )) ; do
64
70
if /usr/lib/nvidia/sriov-manage -e ALL; then
Original file line number Diff line number Diff line change 5
5
DRIVER_VERSION=${DRIVER_VERSION:? " Missing driver version" }
6
6
DRIVER_ARCH=${DRIVER_ARCH:? " Missing driver arch" }
7
7
DRIVER_RESET_RETRIES=10
8
+ DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:- 15}
8
9
KERNEL_VERSION=$( uname -r)
9
10
RUN_DIR=/run/nvidia
10
11
@@ -119,7 +120,7 @@ _install_driver() {
119
120
120
121
# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
121
122
_load_driver () {
122
- /usr/bin/nvidia-vgpud &
123
+ /usr/bin/nvidia-vgpud
123
124
/usr/bin/nvidia-vgpu-mgr &
124
125
125
126
# check nvidia drivers are loaded
@@ -133,6 +134,11 @@ _load_driver() {
133
134
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
134
135
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
135
136
_enable_vfs () {
137
+ # Wait before attempting to create VFs to ensure the driver has finished initializing.
138
+ # This is a workaround (WAR) for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
139
+ # exit code even though VF creation fails.
140
+ sleep $DELAY_BEFORE_VF_CREATION
141
+
136
142
local retry
137
143
for (( retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++ )) ; do
138
144
if /usr/lib/nvidia/sriov-manage -e ALL; then
Original file line number Diff line number Diff line change 5
5
DRIVER_VERSION=${DRIVER_VERSION:? " Missing driver version" }
6
6
DRIVER_ARCH=${DRIVER_ARCH:? " Missing driver arch" }
7
7
DRIVER_RESET_RETRIES=10
8
+ DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:- 15}
8
9
KERNEL_VERSION=$( uname -r)
9
10
RUN_DIR=/run/nvidia
10
11
@@ -119,7 +120,7 @@ _create_dev_char_directory() {
119
120
120
121
# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
121
122
_load_driver () {
122
- /usr/bin/nvidia-vgpud &
123
+ /usr/bin/nvidia-vgpud
123
124
/usr/bin/nvidia-vgpu-mgr &
124
125
125
126
# check nvidia drivers are loaded
@@ -133,6 +134,11 @@ _load_driver() {
133
134
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
134
135
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
135
136
_enable_vfs () {
137
+ # Wait before attempting to create VFs to ensure the driver has finished initializing.
138
+ # This is a workaround (WAR) for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
139
+ # exit code even though VF creation fails.
140
+ sleep $DELAY_BEFORE_VF_CREATION
141
+
136
142
local retry
137
143
for (( retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++ )) ; do
138
144
if /usr/lib/nvidia/sriov-manage -e ALL; then
You can’t perform that action at this time.
0 commit comments