-
Notifications
You must be signed in to change notification settings - Fork 113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Define a new EDPM role for installing nvidia driver on nodes #2637
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
--- | ||
# Copyright 2024 Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. | ||
|
||
|
||
# Play: query OpenShift for the OpenStackBaremetalSet resources, derive one
# inventory entry per baremetal host and wait until each of them is reachable.
- name: Gather the list of EDPM computes
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: false
  tasks:
    - name: Fetch OSP BMO nodesets
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
        PATH: "{{ cifmw_path }}"
      ansible.builtin.command:
        cmd: >-
          oc get OpenStackBaremetalSet -n "{{ namespace | default('openstack') }}" -o yaml
      register: _osp_bmo_nodsets_oc_out
      # `oc get` only reads cluster state, so never report a change.
      changed_when: false

    # Builds a list of dicts (name, ip, user, group) from the YAML returned by
    # the previous task. The group is the first dash-separated token of the
    # host name with an "s" appended (e.g. "edpm-compute-0" gives "edpms").
    - name: Craft the BM hosts list
      ansible.builtin.set_fact:
        _bmo_provisioned_hosts: >-
          {% set hosts = [] -%}
          {% set nodesets = (_osp_bmo_nodsets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
          {% for spec in nodesets | map(attribute='spec') -%}
          {% for host_key, host_val in spec.baremetalHosts.items() -%}
          {% set _ = hosts.append(
          {
          'name': host_key,
          'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
          'user': spec.cloudUserName,
          'group': host_key | split('-') | first + 's'
          }) -%}
          {% endfor -%}
          {% endfor -%}
          {{ hosts }}

    # Registers every crafted host in the in-memory inventory. SSH host key
    # checking is disabled through the extra SSH arguments.
    - name: Add OSP BMO nodesets to Ansible
      ansible.builtin.add_host:
        name: "{{ item.name }}"
        groups: "{{ item.group }}"
        ansible_ssh_user: "{{ item.user }}"
        ansible_host: "{{ item.ip }}"
        ansible_ssh_private_key_file: "{{ ansible_user_dir }}/.ssh/id_cifw"
        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
      loop: "{{ _bmo_provisioned_hosts }}"

    # Polls each registered host every 2 seconds, for up to 600 seconds.
    - name: Wait for the instance to boot
      delegate_to: "{{ item.name }}"
      ansible.builtin.wait_for_connection:
        sleep: 2
        timeout: 600
      loop: "{{ _bmo_provisioned_hosts }}"
|
||
# Play: run the phase1 tasks of the edpm_nvidia_mdev_prepare role on the hosts
# of the `edpms` inventory group.
- name: Run the Nvidia phase 1 role
  hosts: edpms
  tasks:
    # As a reminder, at the end of phase1, the compute will reboot
    - name: Run phase1
      ansible.builtin.import_role:
        tasks_from: phase1
        name: edpm_nvidia_mdev_prepare
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
--- | ||
# Copyright 2024 Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. | ||
|
||
|
||
# Play: query OpenShift for the OpenStackBaremetalSet resources, derive one
# inventory entry per baremetal host and wait until each of them is reachable.
- name: Gather the list of EDPM computes
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: false
  tasks:
    - name: Fetch OSP BMO nodesets
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
        PATH: "{{ cifmw_path }}"
      ansible.builtin.command:
        cmd: >-
          oc get OpenStackBaremetalSet -n "{{ namespace | default('openstack') }}" -o yaml
      register: _osp_bmo_nodsets_oc_out
      # `oc get` only reads cluster state, so never report a change.
      changed_when: false

    # Builds a list of dicts (name, ip, user, group) from the YAML returned by
    # the previous task. The group is the first dash-separated token of the
    # host name with an "s" appended (e.g. "edpm-compute-0" gives "edpms").
    - name: Craft the BM hosts list
      ansible.builtin.set_fact:
        _bmo_provisioned_hosts: >-
          {% set hosts = [] -%}
          {% set nodesets = (_osp_bmo_nodsets_oc_out.stdout | from_yaml)['items'] | default([]) -%}
          {% for spec in nodesets | map(attribute='spec') -%}
          {% for host_key, host_val in spec.baremetalHosts.items() -%}
          {% set _ = hosts.append(
          {
          'name': host_key,
          'ip': host_val['ctlPlaneIP'] | ansible.utils.ipaddr('address'),
          'user': spec.cloudUserName,
          'group': host_key | split('-') | first + 's'
          }) -%}
          {% endfor -%}
          {% endfor -%}
          {{ hosts }}

    # Registers every crafted host in the in-memory inventory. SSH host key
    # checking is disabled through the extra SSH arguments.
    - name: Add OSP BMO nodesets to Ansible
      ansible.builtin.add_host:
        name: "{{ item.name }}"
        groups: "{{ item.group }}"
        ansible_ssh_user: "{{ item.user }}"
        ansible_host: "{{ item.ip }}"
        ansible_ssh_private_key_file: "{{ ansible_user_dir }}/.ssh/id_cifw"
        ansible_ssh_extra_args: '-o StrictHostKeyChecking=no'
      loop: "{{ _bmo_provisioned_hosts }}"

    # Polls each registered host every 2 seconds, for up to 600 seconds.
    - name: Wait for the instance to boot
      delegate_to: "{{ item.name }}"
      ansible.builtin.wait_for_connection:
        sleep: 2
        timeout: 600
      loop: "{{ _bmo_provisioned_hosts }}"
|
||
# Play: run the phase2 tasks of the edpm_nvidia_mdev_prepare role on the hosts
# of the `edpms` inventory group.
- name: Run the Nvidia phase 2 role
  hosts: edpms
  tasks:
    - name: Run phase 2
      ansible.builtin.import_role:
        tasks_from: phase2
        name: edpm_nvidia_mdev_prepare
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# edpm_nvidia_mdev_prepare | ||
|
||
This Ansible role prepares an EDPM node for nvidia mediated devices (mdev) | ||
usage by installing, from an external location, an RPM package containing the | ||
nvidia driver. | ||
|
||
## Privilege escalation | ||
`become` will be set to True during the phase1 task in order to blacklist | ||
the nouveau driver, regenerate the initramfs and then install the nvidia RPM | ||
package. During phase2, we'll also create system files. | ||
|
||
## Parameters | ||
|
||
* `cifmw_edpm_nvidia_mdev_prepare_disable_nouveau`: (boolean) Whether to disable the `nouveau` kernel driver. Default: `true`. | ||
|
||
* `cifmw_edpm_nvidia_mdev_prepare_driver_url`: (string) The location of the proprietary nvidia driver RPM package so that we can install it. Can be any URI supported by DNF. | ||
|
||
* `cifmw_edpm_nvidia_mdev_prepare_package_name`: (string) The name of the package once installed, which can't be inferred from the driver URL. Default: `NVIDIA-vGPU-rhel`. | ||
|
||
* `cifmw_edpm_nvidia_mdev_prepare_sriov_devices`: (list) List of PCI addresses corresponding to the nvidia physical SR-IOV GPUs that require virtual functions creation. Leave it defaulted to ALL if you want to enable SR-IOV VFs for all your nvidia GPUs. | ||
|
||
* `cifmw_edpm_nvidia_mdev_prepare_sriov_manage_start`: (boolean) Whether to start the service that automatically creates the virtual functions (VFs), so that they are created upon reboot. Default: `true`. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
--- | ||
# Copyright Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. | ||
|
||
# Whether the nouveau kernel driver has to be disabled.
cifmw_edpm_nvidia_mdev_prepare_disable_nouveau: true

# Name of the nvidia package.
cifmw_edpm_nvidia_mdev_prepare_package_name: 'NVIDIA-vGPU-rhel'

# SR-IOV GPU devices for which VFs should be created.
cifmw_edpm_nvidia_mdev_prepare_sriov_devices: ['ALL']

# Whether to start the service that automatically creates the VFs.
cifmw_edpm_nvidia_mdev_prepare_sriov_manage_start: true
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please delete |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Template unit: the instance name (%i) is handed to sriov-manage as the
# device for which virtual functions are enabled.
[Unit]
Description = Enable Nvidia GPU virtual functions
# Only start once the nvidia vGPU services have been started.
After = nvidia-vgpu-mgr.service
After = nvidia-vgpud.service

[Service]
# One-shot command; RemainAfterExit keeps the unit "active" once it has run.
Type = oneshot
RemainAfterExit = True
User = root
Group = root
# NOTE(review): the 30 seconds delay presumably lets the nvidia services
# settle before the VFs are enabled - confirm.
ExecStartPre = /usr/bin/sleep 30
ExecStart = /usr/lib/nvidia/sriov-manage -e %i
# Give a reasonable amount of time for the command to start up/shut down
TimeoutSec = 120
# Run inside system.slice. The accounting options give us the ability to see
# resource usage through the `systemd-cgtop` command.
Slice = system.slice
# Set Accounting. IOAccounting replaces the deprecated BlockIOAccounting.
CPUAccounting = True
IOAccounting = True
MemoryAccounting = True
TasksAccounting = True

[Install]
WantedBy = multi-user.target
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please delete |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
--- | ||
# Copyright Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
--- | ||
# Copyright Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. | ||
|
||
|
||
# Ansible Galaxy metadata of the edpm_nvidia_mdev_prepare role.
galaxy_info:
  namespace: cifmw
  author: CI Framework
  company: Red Hat
  description: CI Framework Role -- edpm_nvidia_mdev_prepare
  license: Apache-2.0
  min_ansible_version: '2.14'
  galaxy_tags: [cifmw]

# Role dependencies go here, one per line. Replace the empty list ('[]')
# below when adding dependencies.
dependencies: []
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
--- | ||
# Copyright Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. | ||
|
||
|
||
# Molecule converge play: runs phase1 and phase2 of the role and, after each
# phase, asserts that the expected files exist on the test host.
- name: Converge
  hosts: all
  vars:
    # `tmux` is used as the "driver" to install; a later task checks that the
    # tmux command is available.
    cifmw_edpm_nvidia_mdev_prepare_driver_url: 'tmux'
    cifmw_edpm_nvidia_mdev_prepare_sriov_manage_start: false
  tasks:
    - name: Run phase1
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase1

    - name: Check expected files in phase 1
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        - "/etc/modprobe.d/blacklist-nouveau.conf"
      register: phase1_files

    - name: Check if expected files were created
      ansible.builtin.assert:
        that: item.stat.exists
      loop: "{{ phase1_files.results }}"

    - name: Check that tmux was installed
      ansible.builtin.command:
        cmd: tmux lscm
      register: tmux_id
      # Verification only: the command never changes the host.
      changed_when: false
      failed_when: tmux_id.rc != 0

    - name: Run phase 2
      ansible.builtin.import_role:
        name: edpm_nvidia_mdev_prepare
        tasks_from: phase2

    # NOTE(review): the path below looks mangled by e-mail obfuscation in the
    # reviewed copy ("[email protected]"); confirm the real unit file name.
    - name: Check expected files in phase 2
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        - "/etc/systemd/system/[email protected]"
      register: phase2_files

    - name: Check if expected files were created
      ansible.builtin.assert:
        that: item.stat.exists
      loop: "{{ phase2_files.results }}"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
--- | ||
# Overrides for the defaults set in .config/molecule/.
# "config_podman.yml" is used by default; CI uses "config_local.yml" instead.
provisioner:
  name: ansible
  env:
    ANSIBLE_STDOUT_CALLBACK: 'yaml'
  log: true

log: true
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
--- | ||
# Copyright Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. | ||
|
||
|
||
# Molecule prepare play: applies the test_deps role to every test host.
- name: Prepare
  hosts: all
  roles:
    - test_deps
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please delete |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
--- | ||
# Copyright Red Hat, Inc. | ||
# All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
# not use this file except in compliance with the License. You may obtain | ||
# a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | ||
# License for the specific language governing permissions and limitations | ||
# under the License. | ||
|
||
# Placeholder cleanup task: it only prints a message.
- name: Cleaning the World
  ansible.builtin.debug:
    msg: 'So here edpm_nvidia_mdev_prepare should clean things up!'
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.