Wait for GPU Operator Subscription, InstallPlan & Deployment (#1108)
To avoid a situation where gpu_deploy.sh waits for nvidia-gpu-operator pods before they have been created by the Nvidia GPU Operator installation, the script now first waits for the Operator Subscription, InstallPlan & Deployment to complete.
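
In essence, the change gates on the OLM resources before polling for operator pods. A minimal sketch of that ordering, using only commands taken from the diff below (it assumes oc is already logged in and the operator is installed into the nvidia-gpu-operator namespace):

# 1. Subscription has resolved the latest CSV
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified
# 2. InstallPlan has finished installing
oc wait --timeout=3m --for condition=Installed -n nvidia-gpu-operator installplan --all
# 3. Operator Deployment has rolled out
oc rollout status --watch --timeout=3m -n nvidia-gpu-operator deploy gpu-operator
# 4. Only now wait for the operator pods themselves
oc wait --timeout=1200s --for=condition=ready pod -n nvidia-gpu-operator -l app=gpu-operator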
manosnoam authored Jan 15, 2024
2 parents afa1de6 + 4ba4bd8 commit b0c8743
Showing 1 changed file with 36 additions and 10 deletions.
46 changes: 36 additions & 10 deletions ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
@@ -1,25 +1,51 @@
#!/bin/bash
set -e

-# Make changes to gpu install file
+echo "Create and apply 'gpu_install.yaml' to install Nvidia GPU Operator"

GPU_INSTALL_DIR="$(dirname "$0")"

-CHANNEL=$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}')
+CHANNEL="$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}')"

-CSVNAME=$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -ojson | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')
+CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')"

-sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" ${GPU_INSTALL_DIR}/gpu_install.yaml
+sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

-oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml
+oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"

echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub nfd

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

oc wait --timeout=3m --for condition=Installed -n nvidia-gpu-operator installplan --all

oc rollout status --watch --timeout=3m -n nvidia-gpu-operator deploy gpu-operator

oc rollout status --watch --timeout=3m -n nvidia-gpu-operator deploy nfd-controller-manager

oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator nfd.nvidia-gpu-operator

oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator gpu-operator-certified.nvidia-gpu-operator

function wait_until_pod_ready_status() {
local timeout_seconds=1200
local pod_label=$1
local namespace=nvidia-gpu-operator

echo "Waiting until GPU pods of '$pod_label' in namespace '$namespace' are in running state..."
oc wait --timeout=${timeout_seconds}s --for=condition=ready pod -n $namespace -l app="$pod_label"
local timeout=240
start_time=$(date +%s)
while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
echo "$pod_status"
if [ -n "$pod_status" ]; then
echo "Waiting until GPU pods of '$pod_label' in namespace '$namespace' are in running state..."
oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label"
break
fi
echo "Waiting for pod with label app='$pod_label' to be present..."
sleep 5
done
}

@@ -51,8 +77,8 @@ function rerun_accelerator_migration() {
}

wait_until_pod_ready_status "gpu-operator"
-oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml
-oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json
+oc apply -f "$GPU_INSTALL_DIR/nfd_deploy.yaml"
+oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
