Adding Scale Out functionality #613

Open · wants to merge 1 commit into base: main

15 changes: 15 additions & 0 deletions ansible/mno-scale-out.yml
@@ -0,0 +1,15 @@
---
- name: Adds nodes to a cluster
hosts: bastion
vars_files:
- vars/scale_out.yml
- vars/all.yml
Member: Is the all.yml needed for scaling up the cluster?

roles:
- mno-scale-out
- role: boot-iso
vars:
inventory_group: 'worker'
offset: "{{ current_worker_count }}"
index: "{{ current_worker_count+scale_out_count }}"
virtual_media_iso: "mno-scale-out.x86_64.iso"
- mno-scale-out-csr
5 changes: 5 additions & 0 deletions ansible/roles/boot-iso/defaults/main.yml
@@ -4,3 +4,8 @@
# This will be your bastion machine (if you run setup-bastion playbook)
http_store_host: "{{ groups['bastion'][0] }}"
http_store_port: 8081

# This is always 0 for a deploy
# For a scale out it indicates how many worker nodes are already deployed
# and should not be included in the scale out from the inventory.
offset: 0
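
As an aside (not part of this diff), a minimal sketch of how offset and index combine to slice the inventory group; the group contents, task, and values below are hypothetical:

```yaml
# Hypothetical worker group: worker000 .. worker004 (5 hosts)
# Initial deploy:   offset=0, index=3 -> worker000, worker001, worker002
# Scale out by two: offset=3, index=5 -> worker003, worker004
- name: Show which hosts a scale out would target (illustrative only)
  debug:
    msg: "{{ groups['worker'][offset|int:index|int] }}"
  vars:
    offset: 3
    index: 5
```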
6 changes: 5 additions & 1 deletion ansible/roles/boot-iso/tasks/dell.yml
@@ -1,6 +1,10 @@
---
# Dell tasks for booting an iso

- name: Set Virtual Media ISO
set_fact:
_virtual_media_iso: "{{ virtual_media_iso | default(hostvars[item]['boot_iso']) }}"

- name: Dell - Power down machine prior to booting iso
shell: |
ipmitool -I lanplus -H {{ hostvars[item]['bmc_address'] }} -U {{ hostvars[item]['bmc_user'] }} -P {{ hostvars[item]['bmc_password'] }} chassis power off
@@ -80,7 +84,7 @@
username: "{{ hostvars[item]['bmc_user'] }}"
password: "{{ hostvars[item]['bmc_password'] }}"
virtual_media:
image_url: "http://{{ http_store_host }}:{{ http_store_port }}/{{ hostvars[item]['boot_iso'] }}"
image_url: "http://{{ http_store_host }}:{{ http_store_port }}/{{ _virtual_media_iso }}"
media_types:
- CD
- DVD
6 changes: 5 additions & 1 deletion ansible/roles/boot-iso/tasks/hp.yml
@@ -1,6 +1,10 @@
---
# HP tasks for booting an iso

- name: Set Virtual Media ISO
set_fact:
_virtual_media_iso: "{{ virtual_media_iso | default(hostvars[item]['boot_iso']) }}"

- name: HP - Power off
hpilo_boot:
host: "{{ hostvars[item]['bmc_address'] }}"
@@ -20,5 +24,5 @@
host: "{{ hostvars[item]['bmc_address'] }}"
login: "{{ hostvars[item]['bmc_user'] }}"
password: "{{ hostvars[item]['bmc_password'] }}"
image: "http://{{ http_store_host }}:{{ http_store_port }}/{{ hostvars[item]['boot_iso'] }}"
image: "http://{{ http_store_host }}:{{ http_store_port }}/{{ _virtual_media_iso }}"
media: cdrom
6 changes: 5 additions & 1 deletion ansible/roles/boot-iso/tasks/lenovo.yml
@@ -1,6 +1,10 @@
---
# Lenovo tasks for booting an iso

- name: Set Virtual Media ISO
set_fact:
_virtual_media_iso: "{{ virtual_media_iso | default(hostvars[item]['boot_iso']) }}"

- name: Lenovo - Power off
community.general.redfish_command:
category: Systems
@@ -26,7 +30,7 @@
username: "{{ hostvars[item]['bmc_user'] }}"
password: "{{ hostvars[item]['bmc_password'] }}"
virtual_media:
image_url: "http://{{ http_store_host }}:{{ http_store_port }}/iso/{{ hostvars[item]['boot_iso'] }}"
image_url: "http://{{ http_store_host }}:{{ http_store_port }}/iso/{{ _virtual_media_iso }}"
media_types:
- CD
- DVD
6 changes: 5 additions & 1 deletion ansible/roles/boot-iso/tasks/libvirt.yml
@@ -3,6 +3,10 @@
# Couldn't use ansible redfish_command it requires username and password to be used.
# URLs modeled from http://docs.openstack.org/sushy-tools/latest/user/dynamic-emulator.html

- name: Set Virtual Media ISO
set_fact:
_virtual_media_iso: "{{ virtual_media_iso | default(hostvars[item]['boot_iso']) }}"

- name: Libvirt - Power down machine prior to booting iso
uri:
url: "http://{{ hostvars[item]['ansible_host'] }}:9000/redfish/v1/Systems/{{ hostvars[item]['domain_uuid'] }}/Actions/ComputerSystem.Reset"
@@ -65,7 +69,7 @@
headers:
content-type: application/json
Accept: application/json
body: {"Image":"http://{{ http_store_host }}:{{ http_store_port }}/{{ hostvars[item]['boot_iso'] }}", "Inserted": true}
body: {"Image":"http://{{ http_store_host }}:{{ http_store_port }}/{{ _virtual_media_iso }}", "Inserted": true}
body_format: json
validate_certs: no
status_code: 204
10 changes: 5 additions & 5 deletions ansible/roles/boot-iso/tasks/main.yml
@@ -4,29 +4,29 @@
- name: Boot iso on dell hardware
include_tasks: dell.yml
with_items:
- "{{ groups[inventory_group][:index|int] }}"
- "{{ groups[inventory_group][offset:index|int] }}"
Member: Does offset need the cast to an int here (Line 7, 13, 19, and 25) like line 31 has?

when: hostvars[item]['vendor'] == 'Dell'

- name: Boot iso on hp hardware
include_tasks: hp.yml
with_items:
- "{{ groups[inventory_group][:index|int] }}"
- "{{ groups[inventory_group][offset:index|int] }}"
when: hostvars[item]['vendor'] == 'Hp'

- name: Boot iso on supermicro hardware
include_tasks: supermicro.yml
with_items:
- "{{ groups[inventory_group][:index|int] }}"
- "{{ groups[inventory_group][offset:index|int] }}"
when: hostvars[item]['vendor'] == 'Supermicro'

- name: Boot iso on lenovo hardware
include_tasks: lenovo.yml
with_items:
- "{{ groups[inventory_group][:index|int] }}"
- "{{ groups[inventory_group][offset:index|int] }}"
when: hostvars[item]['vendor'] == 'Lenovo'

- name: Boot iso on libvirt vm
include_tasks: libvirt.yml
with_items:
- "{{ groups[inventory_group][:index|int] }}"
- "{{ groups[inventory_group][offset|int:index|int] }}"
when: hostvars[item]['vendor'] == 'Libvirt'
6 changes: 5 additions & 1 deletion ansible/roles/boot-iso/tasks/supermicro.yml
@@ -1,6 +1,10 @@
---
# Supermicro tasks for booting an iso

- name: Set Virtual Media ISO
set_fact:
_virtual_media_iso: "{{ virtual_media_iso | default(hostvars[item]['boot_iso']) }}"

- name: SuperMicro - Power off
community.general.redfish_command:
category: Systems
@@ -16,7 +20,7 @@
# Retry because sometimes mounting will fail if it occurs too quickly after unmounting
- name: SuperMicro - Mount ISO
shell: |
SMCIPMITool {{ hostvars[item]['bmc_address'] }} {{ hostvars[item]['bmc_user'] }} {{ hostvars[item]['bmc_password'] }} wsiso mount "http://{{ http_store_host }}:{{ http_store_port }}" /iso/{{ hostvars[item]['boot_iso'] }}
SMCIPMITool {{ hostvars[item]['bmc_address'] }} {{ hostvars[item]['bmc_user'] }} {{ hostvars[item]['bmc_password'] }} wsiso mount "http://{{ http_store_host }}:{{ http_store_port }}" /iso/{{ _virtual_media_iso }}
register: mount_iso
until: not mount_iso.failed
retries: 10
2 changes: 2 additions & 0 deletions ansible/roles/mno-scale-out-csr/defaults/main.yml
@@ -0,0 +1,2 @@
---
scale_out_count: 0
57 changes: 57 additions & 0 deletions ansible/roles/mno-scale-out-csr/tasks/check_nodes_joined.yml
@@ -0,0 +1,57 @@
---
- name: Set Facts to recurse with
set_fact:
r_qry: "{{ qry }}"
r_worker_counter: "{{ worker_counter }}"

- name: approve CSRs and check if nodes have joined the cluster
block:
- name: Increment the retry count
set_fact:
retry: "{{ 0 if retry is undefined else retry | int + 1 }}"

- name: Pause during loop
pause:
seconds: "30"
when: retry|int > 0

- name: Get CSRs
shell: |
KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc get csr -o json
register: oc_get_csr

- name: Approve pending CSRs
shell: |
KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc adm certificate approve {{ item.metadata.name }}
with_items: "{{ oc_get_csr.stdout | from_json | json_query(qry) }}"
Member: I know we still have a lot of with_items throughout the playbooks and roles, but I think we should strive for new tasks to use loop instead.

loop_control:
label: "{{ item.metadata.name }}"

- name: Get worker node count
shell: |
KUBECONFIG={{ bastion_cluster_config_dir }}/kubeconfig oc get nodes | {{ worker_counter }}
register: oc_get_nodes_workers

- name: Current Worker Node Count
debug:
var: oc_get_nodes_workers.stdout

- name: Waiting for Worker Node Count
debug:
msg: "{{ current_worker_count+scale_out_count }}"

- name: Raise fail to trigger retry if all nodes have not met requirements
fail:
msg: All nodes have not met check requirements
when: oc_get_nodes_workers.stdout|int < current_worker_count+scale_out_count
rescue:
- name: Fail on maximum retry count
fail:
msg: Maximum retries reached
when: retry | int == 540

- name: Retry the check
include_tasks: check_nodes_joined.yml
vars:
qry: "{{ r_qry }}"
worker_counter: "{{ r_worker_counter }}"
12 changes: 12 additions & 0 deletions ansible/roles/mno-scale-out-csr/tasks/main.yml
@@ -0,0 +1,12 @@
---
- name: Approve node-bootstrapper CSRs and wait for nodes to join cluster
include_tasks: check_nodes_joined.yml
vars:
qry: "items[?status.conditions==null && spec.username == 'system:serviceaccount:openshift-machine-config-operator:node-bootstrapper']"
worker_counter: "grep -c worker"

- name: Approve kubelet-serving CSRs and wait for nodes to join cluster
include_tasks: check_nodes_joined.yml
vars:
qry: "items[?status.conditions==null && spec.signerName == 'kubernetes.io/kubelet-serving']"
worker_counter: "grep worker | grep -c -v NotReady"
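
For reference (not part of this diff), a pending node-bootstrapper CSR — one element of items in the `oc get csr -o json` output — looks roughly like the sketch below (shown as YAML, trimmed, values illustrative); it matches the first query because status has no conditions yet:

```yaml
metadata:
  name: csr-7xk2p   # illustrative name
spec:
  username: system:serviceaccount:openshift-machine-config-operator:node-bootstrapper
  signerName: kubernetes.io/kube-apiserver-client-kubelet
status: {}          # no conditions yet, so the node-bootstrapper query selects it
```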
2 changes: 2 additions & 0 deletions ansible/roles/mno-scale-out/defaults/main.yml
@@ -0,0 +1,2 @@
---
scale_out_count: 0
33 changes: 33 additions & 0 deletions ansible/roles/mno-scale-out/tasks/main.yml
@@ -0,0 +1,33 @@
---
- name: Delete mno-scale-out directory
ansible.builtin.file:
state: absent
path: /root/mno-scale-out

- name: Create mno-scale-out directory
ansible.builtin.file:
state: directory
path: /root/mno-scale-out

- name: Template Scaleout Nodes Config
template:
src: nodes-config.yml.j2
dest: /root/mno-scale-out/nodes-config.yaml
vars:
workers: "{{ groups['worker'][current_worker_count:current_worker_count+scale_out_count] }}"

- name: Add Nodes to cluster and generate boot iso (Takes a min or two)
command: oc adm node-image create --kubeconfig {{ bastion_cluster_config_dir }}/kubeconfig
args:
chdir: /root/mno-scale-out/

- name: Copy scale out discovery iso to http server
ansible.builtin.copy:
src: /root/mno-scale-out/node.x86_64.iso
dest: /opt/http_store/data/mno-scale-out.x86_64.iso

- name: Delete mno-scale-out directory
ansible.builtin.file:
state: absent
path: /root/mno-scale-out

44 changes: 44 additions & 0 deletions ansible/roles/mno-scale-out/templates/nodes-config.yml.j2
@@ -0,0 +1,44 @@
---
hosts:
{% for worker in workers %}
- hostname: {{ worker }}
rootDeviceHints:
deviceName: {{ hostvars[worker].install_disk }}
interfaces:
{% if hostvars[worker].lab_mac | default(False) %}
- macAddress: {{ hostvars[worker].lab_mac }}
name: {{ hostvars[worker].lab_interface }}
{% endif %}
- macAddress: {{ hostvars[worker].mac_address }}
name: {{ hostvars[worker].network_interface }}
networkConfig:
interfaces:
- name: {{ hostvars[worker].network_interface }}
type: ethernet
state: up
mac-address: {{ hostvars[worker].mac_address }}
ipv4:
enabled: true
address:
- ip: {{ hostvars[worker].ip }}
prefix-length: {{ hostvars[worker].network_prefix }}
auto-dns: false
{% if hostvars[worker].lab_mac | default(False) %}
- name: {{ hostvars[worker].lab_interface }}
type: ethernet
state: up
mac-address: {{ hostvars[worker].lab_mac }}
ipv4:
enabled: false
auto-dns: false
{% endif %}
dns-resolver:
config:
server:
- {{ hostvars[worker].dns1 }}
routes:
config:
- destination: 0.0.0.0/0
next-hop-address: {{ hostvars[worker].gateway }}
next-hop-interface: eth0
{% endfor %}
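
For illustration (not part of this diff), the template above renders to something like the following for a single worker with one NIC and no lab interface; all hostnames, MACs, and addresses are hypothetical:

```yaml
---
hosts:
- hostname: worker120
  rootDeviceHints:
    deviceName: /dev/sda
  interfaces:
  - macAddress: 52:54:00:aa:bb:01
    name: eno1
  networkConfig:
    interfaces:
    - name: eno1
      type: ethernet
      state: up
      mac-address: 52:54:00:aa:bb:01
      ipv4:
        enabled: true
        address:
        - ip: 198.18.10.120
          prefix-length: 24
        auto-dns: false
    dns-resolver:
      config:
        server:
        - 198.18.10.1
    routes:
      config:
      - destination: 0.0.0.0/0
        next-hop-address: 198.18.10.1
        next-hop-interface: eth0
```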
11 changes: 11 additions & 0 deletions ansible/vars/scale_out.yml
@@ -0,0 +1,11 @@
---
# Set this to the number of already deployed worker nodes
# This assumes they are all listed in the worker inventory
group. This variable is an offset used to skip worker node
# records in the worker inventory group.
current_worker_count: 120

# Set this to the number of worker nodes being added to the
# cluster. At minimum, current_worker_count + scale_out_count
# inventory records must exist in the inventory file.
scale_out_count: 100
49 changes: 49 additions & 0 deletions docs/scale-out-mno.md
@@ -0,0 +1,49 @@
# Scale Out a Multi-Node OpenShift Deployment

A Multi-Node OpenShift (MNO) cluster deployed with JetLag can be scaled out via JetLag: workers are added using the JetLag inventory and playbooks. This guide assumes you have an existing OCP cluster deployed via JetLag, and that the worker section of the JetLag inventory file contains records for the worker nodes currently joined to the running cluster.

_**Steps to Scale Out:**_
- [Add Nodes to Worker Inventory](#add-nodes-to-worker-inventory)
- [Update scale_out.yml](#update-scale_outyml)
- [Run mno-scale-out.yml](#run-mno-scale-outyml)

## Add Nodes to Worker Inventory
There are three ways to add new node entries to the worker inventory.

1. New bare metal nodes added to cloud assignment

If more nodes were added to your lab assignment, update worker_node_count in the ansible/vars/all.yml file and rerun the create-inventory playbook. Be sure to compare the previous inventory file to the new one to ensure that everything is the same except the new nodes added to the worker section.

2. New virtual nodes from hv_vm inventory

If you have enabled and set up hv_inventory nodes and VMs, you can use these VMs as new nodes to scale out workers. Records representing the VMs you want to scale to must be added to the end of the worker section. If you have not used any of the VMs for anything other than scale out, you can simply increase the hybrid_worker_count value in ansible/vars/all.yml and rerun the create-inventory playbook. Make sure the appropriate playbooks have been run so that the VM entries being added to the worker inventory section have actually been created on the hv nodes. Be careful: the playbooks that manipulate VMs operate on all hv_vm records at once. Be sure to compare the previous inventory file to the new one to ensure that everything is the same except the new nodes added to the worker section.

3. Manual Entry

You can add new entries to the worker inventory section manually. Place them at the end of the list of worker entries.

The new nodes, bare metal or virtual, must be placed at the end of the worker inventory group; the scale-out playbook is designed to use the last n entries in the inventory.
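
For example, a new bare metal worker entry might look roughly like the sketch below. This is an illustrative record, not a definitive format: the field names mirror the hostvars consumed by the boot-iso role and the nodes-config template, the hostname and all values are placeholders, and you should match the format of the existing entries in your inventory (including optional fields such as lab_mac and lab_interface if your workers define them).

```ini
[worker]
# ... existing worker entries, order unchanged ...
worker120 bmc_address=mgmt-w120.example.com bmc_user=admin bmc_password=secret vendor=Dell mac_address=52:54:00:aa:bb:01 network_interface=eno1 ip=198.18.10.120 network_prefix=24 gateway=198.18.10.1 dns1=198.18.10.1 install_disk=/dev/sda
```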

## Update scale_out.yml
There are two variables in ansible/vars/scale_out.yml that indicate which entries from the worker inventory section should be added to the existing cluster.

- current_worker_count: This value indicates the number of entries in the worker inventory section to skip before starting to add nodes to the existing cluster. This number should match the current number of worker nodes associated with the existing cluster.
- scale_out_count: This value indicates the number of entries in the worker inventory section that will be added as new workers to the existing cluster.

Example: If the initial OCP deployment had three baremetal workers and the intended worker count was ten, current_worker_count would be 3 and scale_out_count would be 7. Scale out from three existing workers, adding seven new workers, for a total of ten worker nodes.
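
Following that example, ansible/vars/scale_out.yml would be edited to look like this:

```yaml
---
# 3 workers are already joined to the cluster; skip their inventory records
current_worker_count: 3

# add the next 7 worker inventory records to the cluster
scale_out_count: 7
```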

## Run mno-scale-out.yml
Once the new worker records are added to the inventory and the scale_out.yml file has the proper values, the final step is to run the mno-scale-out.yml playbook.

```console
(.ansible) [root@xxx-h01-000-r650 jetlag]# ansible-playbook -i ansible/inventory/cloud99.local ansible/mno-scale-out.yml
...
```

This playbook will:
- Generate the node configuration YAML
- Invoke `oc adm node-image create` with the node configuration, which generates a discovery ISO
- Boot the new worker nodes from the generated discovery ISO
- Approve the generated CSRs and wait for the new nodes to join the cluster (see the example below)
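
CSR approval is handled automatically by the mno-scale-out-csr role, but if you want to watch progress manually the checks boil down to commands along these lines (a sketch; point KUBECONFIG at whatever bastion_cluster_config_dir is in your setup):

```console
(.ansible) [root@xxx-h01-000-r650 jetlag]# export KUBECONFIG=/root/mno/kubeconfig
(.ansible) [root@xxx-h01-000-r650 jetlag]# oc get csr | grep Pending
(.ansible) [root@xxx-h01-000-r650 jetlag]# oc adm certificate approve <csr-name>
(.ansible) [root@xxx-h01-000-r650 jetlag]# oc get nodes | grep -c worker
```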

This workflow can be run repeatedly to add more workers to the existing cluster.