From b48b7be66b92a6f9ca9b5a3c71904fb2c9321700 Mon Sep 17 00:00:00 2001 From: manosnoam Date: Thu, 11 Jan 2024 11:48:08 +0000 Subject: [PATCH] Wait for GPU Operator Subscription, InstallPlan & Deployment to complete Signed-off-by: manosnoam --- .../Resources/Provisioning/GPU/gpu_deploy.sh | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh index f1a28336d..9a9eda8bf 100755 --- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh +++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh @@ -1,17 +1,29 @@ #!/bin/bash set -e -# Make changes to gpu install file +echo "Create and apply 'gpu_install.yaml' to install Nvidia GPU Operator" GPU_INSTALL_DIR="$(dirname "$0")" -CHANNEL=$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}') +CHANNEL="$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}')" -CSVNAME=$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -ojson | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV') +CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')" -sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" ${GPU_INSTALL_DIR}/gpu_install.yaml +sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml" -oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml +oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml" + +echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete" + +oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator subs nfd + +oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator subs gpu-operator-certified + +oc wait --timeout=3m --for condition=Installed -n nvidia-gpu-operator installplan --all + +oc rollout status --watch --timeout=3m -n nvidia-gpu-operator deployment gpu-operator + +oc rollout status --watch --timeout=3m -n nvidia-gpu-operator deployment nfd-controller-manager function wait_until_pod_ready_status() { local timeout_seconds=1200 @@ -51,8 +63,8 @@ function rerun_accelerator_migration() { } wait_until_pod_ready_status "gpu-operator" -oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml -oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json +oc apply -f "$GPU_INSTALL_DIR/nfd_deploy.yaml" +oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json oc apply -f clusterpolicy.json wait_until_pod_ready_status "nvidia-device-plugin-daemonset" wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"