Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for amd operator installation #1498

Merged
merged 3 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Namespace that holds the OperatorGroup and Subscription defined below.
apiVersion: v1
kind: Namespace
metadata:
name: openshift-amd-gpu

---

# OperatorGroup for the openshift-amd-gpu namespace. The spec is left empty,
# i.e. no targetNamespaces are listed here.
# NOTE(review): presumably this makes the operator watch all namespaces - confirm against the OLM docs.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
name: openshift-amd-gpu-operator-group
namespace: openshift-amd-gpu
spec: {}

---

# Subscription to the amd-gpu-operator package: "alpha" channel of the
# community-operators catalog source (in openshift-marketplace), with install
# plans approved automatically.
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
name: amd-gpu-operator
namespace: openshift-amd-gpu
spec:
channel: alpha
installPlanApproval: Automatic
name: amd-gpu-operator
source: community-operators
sourceNamespace: openshift-marketplace

136 changes: 136 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/bin/bash
# Provisions AMD GPU support on an OpenShift cluster through the 'oc' CLI:
# checks the internal image registry, applies the amdgpu blacklist
# MachineConfig, then installs the NFD, KMM and AMD GPU operators and creates
# a DeviceConfig.

# Abort as soon as any unguarded command fails.
set -e

# Directory this script was invoked from; the manifests applied further down
# are addressed relative to it.
GPU_INSTALL_DIR="$(dirname "$0")"

#######################################
# Configure the cluster image registry operator so an internal registry is
# deployed.
# Applies two merge patches, in order, to the cluster-scoped
# configs.imageregistry.operator.openshift.io/cluster object:
#   1. set spec.storage to an emptyDir volume
#   2. set spec.managementState to "Managed"
# Outputs: whatever 'oc patch' prints, then a confirmation line on stdout.
#######################################
function create_registry_network() {
  local patch
  for patch in \
    '{"spec":{"storage":{"emptyDir":{}}}}' \
    '{"spec":{"managementState":"Managed"}}'; do
    oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch "$patch"
  done
  echo "Internal registry network created."
}

#######################################
# Check whether an internal image-registry pod exists.
# Looks up pods labelled docker-registry=default in the
# openshift-image-registry namespace. When none is found, the registry is
# configured via create_registry_network and the check still reports failure.
# Globals:  registry_pod (written) - name(s) of the registry pod(s) found
# Outputs:  a status line on stdout
# Returns:  0 if a registry pod is present, 1 otherwise
#######################################
function check_registry() {
  registry_pod=$(oc get pod -l docker-registry=default -n openshift-image-registry --no-headers -o custom-columns=":metadata.name")

  # Guard clause: nothing found -> trigger registry creation and report failure.
  if [[ -z "$registry_pod" ]]; then
    echo "Internal registry pod is not present."
    create_registry_network
    return 1
  fi

  echo "Internal registry pod ($registry_pod) is present."
  return 0
}
#######################################
# Poll a condition every 2 seconds for as long as it holds.
#
# Usage: wait_while TIMEOUT [!] COMMAND [ARG...]
#        wait_while TIMEOUT [!] 'shell snippet'
#
# Arguments:
#   $1   - timeout in seconds
#   $2.. - the condition. An optional leading '!' negates it. If exactly one
#          word follows, it is evaluated as a shell snippet (legacy string
#          form). If several words follow, they are run as a command with its
#          arguments, and every argument is passed through intact - including
#          arguments that contain spaces. (Joining the words and re-parsing
#          them, as "eval $*" does, would split such arguments apart.)
# Outputs: one '.' per poll, a newline once the wait ends after at least one
#          poll, and a message when the timeout is exceeded (all on stdout).
# Returns: 0 once the condition no longer holds, 1 if the accumulated wait
#          exceeds the timeout.
#######################################
function wait_while {
  local seconds timeout interval negate rc
  interval=2
  seconds=0
  timeout=$1
  shift

  negate=0
  if [[ "${1:-}" == '!' ]]; then
    negate=1
    shift
  fi

  while :; do
    rc=0
    if (( $# == 1 )); then
      # Single word: keep the historical behaviour of evaluating it as shell code.
      eval "$1" || rc=$?
    else
      # Several words: run directly so argument boundaries are preserved.
      "$@" || rc=$?
    fi

    # Leave the loop as soon as the (possibly negated) condition is false.
    if (( negate )); then
      (( rc != 0 )) || break
    else
      (( rc == 0 )) || break
    fi

    seconds=$(( seconds + interval ))
    sleep "$interval"
    echo -n '.'
    if [[ $seconds -gt $timeout ]]; then
      echo "Time out of ${timeout} exceeded"
      return 1
    fi
  done

  # Terminate the line of progress dots, if any were printed.
  if [[ "$seconds" != '0' ]]; then
    echo ''
  fi
  return 0
}

#######################################
# Report whether the ClusterServiceVersion currently referenced by a
# Subscription has reached phase "Succeeded".
# Globals:   phase (written) - phase read from the CSV, when one is referenced
# Arguments: $1 - namespace of the subscription
#            $2 - subscription name
# Returns:   0 if the subscription's current CSV is in phase "Succeeded",
#            1 if no CSV is referenced yet or the phase is anything else.
#######################################
has_csv_succeeded() {
  local ns=$1
  local subscription=$2
  local csv
  csv=$(oc get subscriptions.operators.coreos.com "${subscription}" -n "${ns}" -o=custom-columns=CURRENT_CSV:.status.currentCSV --no-headers=true)

  # An empty value or the "<none>" placeholder means no CSV is referenced yet.
  case "$csv" in
    '' | '<none>')
      return 1
      ;;
  esac

  phase=$(oc get clusterserviceversions.operators.coreos.com -n "${ns}" "${csv}" -o=custom-columns=PHASE:.status.phase --no-headers=true)
  if [[ "$phase" == "Succeeded" ]]; then
    return 0
  fi
  return 1
}

#######################################
# Create the AMD DeviceConfig "dc-internal-registry" in the
# openshift-amd-gpu namespace.
# Uses 'oc apply' (like the other manifests in this script) so that running
# the script again does not fail when the object already exists.
# Outputs: whatever 'oc apply' prints.
# Returns: exit status of 'oc apply'.
#######################################
function create_devconfig() {
  # Quoted delimiter: the manifest is literal, no shell expansion is wanted.
  # name/namespace must be nested under metadata for the manifest to be valid.
  oc apply -f - <<'EOF'
kind: DeviceConfig
apiVersion: amd.io/v1alpha1
metadata:
  name: dc-internal-registry
  namespace: openshift-amd-gpu
EOF
}

#######################################
# Wait for the pods or daemonset carrying label app=<pod_label> to appear in
# a namespace and become ready.
#
# Phase 1 polls every 5 seconds (for up to 240 seconds) until a pod or a
# daemonset with the label exists. Phase 2 blocks on 'oc wait' for pod
# readiness (up to 1200 seconds) and, if that fails, on 'oc rollout status'
# of the daemonset (up to 3 minutes); when both fail the whole check is
# retried while the 240 second budget lasts.
#
# Arguments: $1 - value of the "app" label to select on
#            $2 - namespace to look in
# Outputs:   progress messages on stdout, a timeout message on stderr
# Returns:   0 once the pods are ready or the daemonset has rolled out,
#            1 if that did not happen before the timeout budget ran out.
#######################################
function wait_until_pod_ready_status() {
  local timeout_seconds=1200
  local pod_label=$1
  local namespace=$2
  local timeout=240
  local start_time pod_status daemon_status
  start_time=$(date +%s)

  while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
    # Best-effort lookups: a failing 'oc get' just means "nothing there yet"
    # and must not abort a caller running under 'set -e'.
    pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)" || true
    daemon_status="$(oc get daemonset -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)" || true

    if [[ -n "$daemon_status" || -n "$pod_status" ]]; then
      echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
      echo "Pods status: '$pod_status'"
      echo "Daemonset status: '$daemon_status'"
      if oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" \
        || oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label"; then
        return 0
      fi
      # Neither check succeeded: re-evaluate the time budget and try again.
      continue
    fi

    echo "Waiting for Pods or Daemonset with label app='$pod_label' in namespace '$namespace' to be present..."
    sleep 5
  done

  # Falling out of the loop means the budget was exhausted without success.
  echo "Timed out waiting for Pods or Daemonset with label app='$pod_label' in namespace '$namespace'" >&2
  return 1
}

#######################################
# Report whether every MachineConfigPool has its "Updated" condition set to
# "True".
# Collapses the per-pool status column with 'uniq'; the check passes only
# when the result is exactly the single line "True" (so at least one pool
# must exist and none may report a different status).
# Returns: 0 if all pools are updated, non-zero otherwise.
#######################################
function machineconfig_updates {
  local updated_states
  updated_states=$(oc get machineconfigpool --no-headers=true '-o=custom-columns=UPDATED:.status.conditions[?(@.type=="Updated")].status' | uniq)
  [[ "$updated_states" == "True" ]]
}

#######################################
# Scan the logs currently available for one container for a marker string.
# 'oc logs' is called without -f, so this is a one-shot snapshot of the log,
# not a continuous stream; callers that need to wait must poll.
# Arguments: $1 - pod name (as accepted by 'oc logs')
#            $2 - literal text to look for
#            $3 - namespace of the pod
#            $4 - container name
# Outputs:   a header line plus one line per matching log line, on stdout
# Returns:   0 if at least one log line contains the text, 1 otherwise
#######################################
function monitor_logs() {
  local pod_name=$1
  local search_text=$2
  local ns=$3
  local c_name=$4
  local line
  local found=1
  echo "Monitoring logs for pod $pod_name..."

  # Read via process substitution so 'found' is set in this shell rather than
  # in a pipeline subshell; also accept a final line without trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    if [[ $line == *"$search_text"* ]]; then
      echo "Found \"$search_text\" in pod logs: $line"
      found=0
    fi
  done < <(oc logs "$pod_name" -c "$c_name" -n "$ns")

  return "$found"
}

#######################################
# Poll helper: has the build pod in openshift-amd-gpu logged a successful
# push of the image to the internal registry?
# The build pod is looked up on every call because it may not exist yet right
# after the DeviceConfig is created. Wrapping the call in an argument-less
# function also keeps the search text (which contains a space) intact when it
# is driven through wait_while.
# Returns: 0 if the push message is in the build logs, 1 otherwise
#######################################
function amd_driver_image_pushed() {
  local build_pod
  build_pod=$(oc get pod -n openshift-amd-gpu -l openshift.io/build.name -oname | head -n 1)
  if [[ -z "$build_pod" ]]; then
    return 1
  fi
  monitor_logs "$build_pod" "Successfully pushed image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu" openshift-amd-gpu docker-build
}

# check_registry reports failure when no registry pod exists (after having
# configured one). Test it inside 'if' so 'set -e' does not kill the script
# before the result can be handled.
if ! check_registry; then
  echo "Internal registry check failed; aborting AMD GPU operator installation." >&2
  # 'return' is only valid when the file is sourced; fall back to 'exit'
  # when it is executed directly.
  return 1 2>/dev/null || exit 1
fi

# Blacklist the inbox drivers with a MachineConfig now that the registry check was successful
oc apply -f "$GPU_INSTALL_DIR/blacklist_driver.yaml"

# NOTE(review): the fixed pause presumably gives the MachineConfig rollout
# time to start before the pools are polled - confirm.
sleep 120
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
echo "Installing AMD operator"
oc apply -f "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-amd-gpu amd-gpu-operator
create_devconfig
# Wait (up to 1200 seconds) until the driver image build reports its push.
wait_while 1200 ! amd_driver_image_pushed
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# MachineConfig for the "worker" role that writes a modprobe configuration
# file blacklisting the amdgpu kernel module.
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
labels:
machineconfiguration.openshift.io/role: worker
name: amdgpu-module-blacklist
spec:
config:
ignition:
version: 3.2.0
storage:
files:
- path: "/etc/modprobe.d/amdgpu-blacklist.conf"
# 420 is decimal for octal 0644.
mode: 420
overwrite: true
contents:
# The base64 payload decodes to the single line "blacklist amdgpu".
source: "data:text/plain;base64,YmxhY2tsaXN0IGFtZGdwdQo="
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
apiVersion: v1
kind: Namespace
metadata:
name: openshift-kmm

---

apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
name: openshift-kmm-operator-group
namespace: openshift-kmm
tarukumar marked this conversation as resolved.
Show resolved Hide resolved
spec: {}

---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
name: kernel-module-management
namespace: openshift-kmm
spec:
channel: stable
installPlanApproval: Automatic
name: kernel-module-management
source: redhat-operators
sourceNamespace: openshift-marketplace
4 changes: 2 additions & 2 deletions ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@

CHANNEL="$(oc get packagemanifest gpu-operator-certified -n openshift-marketplace -o jsonpath='{.status.defaultChannel}')"

CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketplace -o json | jq -r '.status.channels[] | select(.name == "'$CHANNEL'") | .currentCSV')"

Check warning on line 10 in ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh

View workflow job for this annotation

GitHub Actions / shellcheck linter

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. [SC2086](https://github.com/koalaman/shellcheck/wiki/SC2086) Raw Output: ./ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh:10:140:info:Double quote to prevent globbing and word splitting. [SC2086](https://github.com/koalaman/shellcheck/wiki/SC2086)

sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/nfd_operator.yaml"
tarukumar marked this conversation as resolved.
Show resolved Hide resolved
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub nfd
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

Expand Down
2 changes: 1 addition & 1 deletion ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ apiVersion: nfd.openshift.io/v1
kind: NodeFeatureDiscovery
metadata:
name: nfd-instance
namespace: nvidia-gpu-operator
namespace: openshift-nfd
spec:
instance: "" # instance is empty by default
topologyupdater: false # False by default
Expand Down
29 changes: 29 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Namespace that holds the OperatorGroup and Subscription defined below.
apiVersion: v1
kind: Namespace
metadata:
name: openshift-nfd

---
# OperatorGroup that targets only the openshift-nfd namespace itself.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
name: openshift-nfd-og
namespace: openshift-nfd
spec:
targetNamespaces:
- openshift-nfd
upgradeStrategy: Default


---
# Subscription to the "nfd" package: "stable" channel of the redhat-operators
# catalog source (in openshift-marketplace), with install plans approved
# automatically.
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
name: nfd
namespace: openshift-nfd
spec:
channel: "stable"
installPlanApproval: Automatic
name: nfd
source: redhat-operators
sourceNamespace: openshift-marketplace
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
SOURCE_MACHINESET=$(oc get machineset -n openshift-machine-api -o name | head -n1)

# Reformat with jq, for better diff result.
oc get -o json -n openshift-machine-api $SOURCE_MACHINESET | jq -r > /tmp/source-machineset.json

Check warning on line 19 in ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh

View workflow job for this annotation

GitHub Actions / shellcheck linter

[shellcheck] reported by reviewdog 🐶 Double quote to prevent globbing and word splitting. [SC2086](https://github.com/koalaman/shellcheck/wiki/SC2086) Raw Output: ./ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh:19:41:info:Double quote to prevent globbing and word splitting. [SC2086](https://github.com/koalaman/shellcheck/wiki/SC2086)

OLD_MACHINESET_NAME=$(jq '.metadata.name' -r /tmp/source-machineset.json )
NEW_MACHINESET_NAME=${OLD_MACHINESET_NAME/worker/gpu}
Expand All @@ -28,7 +28,6 @@
| del(.metadata.uid)
| del(.metadata.creationTimestamp)
| del(.metadata.resourceVersion)
| .spec.template.spec.taints += [{"effect": "NoSchedule" , "key": "nvidia.com/gpu" , "value": "None"}]
' /tmp/source-machineset.json > /tmp/gpu-machineset.json

# Change machineset name
Expand Down
Loading