Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vGPU Manager fixes required for vGPU 17.2 #29

Merged
merged 2 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion vgpu-manager/rhel8/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set -xe

DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
DRIVER_RESET_RETRIES=10
DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:-15}
RUN_DIR=/run/nvidia

# Mount the driver rootfs into the run directory with the exception of sysfs.
Expand Down Expand Up @@ -45,7 +46,7 @@ _install_driver() {

# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
_load_driver() {
/usr/bin/nvidia-vgpud &
/usr/bin/nvidia-vgpud
/usr/bin/nvidia-vgpu-mgr &

# check nvidia drivers are loaded
Expand All @@ -59,6 +60,11 @@ _load_driver() {
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
_enable_vfs() {
# Wait before attempting to create VFs to ensure the driver has finished initializing.
# This is a workaround for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
# exit code even though VF creation fails.
sleep $DELAY_BEFORE_VF_CREATION

local retry
for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do
if /usr/lib/nvidia/sriov-manage -e ALL; then
Expand Down
8 changes: 7 additions & 1 deletion vgpu-manager/ubuntu20.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set -xeu
DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
DRIVER_ARCH=${DRIVER_ARCH:?"Missing driver arch"}
DRIVER_RESET_RETRIES=10
DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:-15}
KERNEL_VERSION=$(uname -r)
RUN_DIR=/run/nvidia

Expand Down Expand Up @@ -119,7 +120,7 @@ _install_driver() {

# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
_load_driver() {
/usr/bin/nvidia-vgpud &
/usr/bin/nvidia-vgpud
/usr/bin/nvidia-vgpu-mgr &

# check nvidia drivers are loaded
Expand All @@ -133,6 +134,11 @@ _load_driver() {
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
_enable_vfs() {
# Wait before attempting to create VFs to ensure the driver has finished initializing.
# This is a workaround for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
# exit code even though VF creation fails.
sleep $DELAY_BEFORE_VF_CREATION

local retry
for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do
if /usr/lib/nvidia/sriov-manage -e ALL; then
Expand Down
8 changes: 7 additions & 1 deletion vgpu-manager/ubuntu22.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set -xeu
DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
DRIVER_ARCH=${DRIVER_ARCH:?"Missing driver arch"}
DRIVER_RESET_RETRIES=10
DELAY_BEFORE_VF_CREATION=${DELAY_BEFORE_VF_CREATION:-15}
KERNEL_VERSION=$(uname -r)
RUN_DIR=/run/nvidia

Expand Down Expand Up @@ -119,7 +120,7 @@ _create_dev_char_directory() {

# Currently _install_driver() takes care of loading nvidia modules. Just need to start necessary vgpu daemons
_load_driver() {
/usr/bin/nvidia-vgpud &
/usr/bin/nvidia-vgpud
/usr/bin/nvidia-vgpu-mgr &

# check nvidia drivers are loaded
Expand All @@ -133,6 +134,11 @@ _load_driver() {
# Enable virtual functions for all physical GPUs on the node that support SR-IOV.
# Retry logic is to account for when the driver is busy (i.e. during driver initialization)
_enable_vfs() {
# Wait before attempting to create VFs to ensure the driver has finished initializing.
# This is a workaround for a bug in vGPU 17.2 where sriov-manage does not return a non-zero
# exit code even though VF creation fails.
sleep $DELAY_BEFORE_VF_CREATION

local retry
for ((retry = 0 ; retry <= $DRIVER_RESET_RETRIES ; retry++)); do
if /usr/lib/nvidia/sriov-manage -e ALL; then
Expand Down