Skip to content

Commit

Permalink
Wait for GPU Operator Subscription, InstallPlan & Deployment to complete
Browse files Browse the repository at this point in the history
Signed-off-by: manosnoam <[email protected]>
  • Loading branch information
manosnoam committed Jan 14, 2024
1 parent afa1de6 commit b48b7be
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,29 @@
#!/bin/bash
set -e

# Make changes to gpu install file
echo "Create and apply 'gpu_install.yaml' to install Nvidia GPU Operator"

GPU_INSTALL_DIR="$(dirname "$0")"

CHANNEL=$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}')
CHANNEL="$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}')"

CSVNAME=$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -ojson | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')
# Look up the currentCSV of the channel named by $CHANNEL in the package manifest.
# The channel name is passed to jq with --arg instead of being spliced into the
# filter text, so it is never subject to shell word splitting/globbing (SC2086)
# and cannot change the meaning of the jq program.
CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r --arg channel "$CHANNEL" '.status.channels[] | select(.name == $channel) | .currentCSV')"

Check warning on line 10 in ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh

View workflow job for this annotation

GitHub Actions / shellcheck linter

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. [SC2086](https://github.com/koalaman/shellcheck/wiki/SC2086) Raw Output: ./ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh:10:140:info:Double quote to prevent globbing and word splitting. [SC2086](https://github.com/koalaman/shellcheck/wiki/SC2086)

Check failure on line 10 in ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh

View workflow job for this annotation

GitHub Actions / shellcheck linter

[shellcheck (suggestion)] reported by reviewdog 🐶 Raw Output: ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh:10:-CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')" ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh:10:+CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r '.status.channels[] | select(.name == "'"$CHANNEL"'") | .currentCSV')"

sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" ${GPU_INSTALL_DIR}/gpu_install.yaml
sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml
oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"

# Block until the Subscriptions, InstallPlans and Deployments in the
# nvidia-gpu-operator namespace are ready; each step is given up to 3 minutes.
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

# Both Subscriptions must report state AtLatestKnown.
for gpu_wait_sub in nfd gpu-operator-certified; do
  oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator subs "$gpu_wait_sub"
done

# Every InstallPlan in the namespace must reach the Installed condition.
oc wait --timeout=3m --for condition=Installed -n nvidia-gpu-operator installplan --all

# Both Deployments must finish rolling out.
for gpu_wait_deploy in gpu-operator nfd-controller-manager; do
  oc rollout status --watch --timeout=3m -n nvidia-gpu-operator deployment "$gpu_wait_deploy"
done

function wait_until_pod_ready_status() {
local timeout_seconds=1200
Expand Down Expand Up @@ -51,8 +63,8 @@ function rerun_accelerator_migration() {
}

wait_until_pod_ready_status "gpu-operator"
oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml
oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json
oc apply -f "$GPU_INSTALL_DIR/nfd_deploy.yaml"
# Save the first element of the CSV's alm-examples annotation as clusterpolicy.json.
# The jq filter is quoted: unquoted, .[0] is a shell glob pattern that would
# expand to a file named ".0" in the working directory (or fail under failglob).
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq '.[0]' > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
Expand Down

0 comments on commit b48b7be

Please sign in to comment.