Incorporate separate StackHPC cloud tests for monitoring and for hosts (#1501)

CI: Add Grafana and OpenSearch Dashboards variables for SOT

Depends-On: stackhpc/stackhpc-cloud-tests#3
Depends-On: stackhpc/stackhpc-cloud-tests#4
Co-authored-by: Mark Goddard <[email protected]>
stackhpc · Feb 18, 2025 · f6cd436 · f6cd436
1 parent cd0d44e
commit f6cd436
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 20 deletions.
diff --git a/.github/workflows/stackhpc-all-in-one.yml b/.github/workflows/stackhpc-all-in-one.yml
@@ -468,7 +468,7 @@ jobs:
             -v $(pwd)/sct-results:/stack/sct-results \
             -e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
             $KAYOBE_IMAGE \
-            /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/stackhpc-cloud-tests.yml' -e sot_version=${{ inputs.stackhpc_cloud_tests_version }}
+            /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/stackhpc-cloud-tests.yml' -e sct_version=${{ inputs.stackhpc_cloud_tests_version }}
         env:
           KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
 
@@ -496,16 +496,26 @@ jobs:
             sct-results/
         if: ${{ !cancelled() && (steps.tempest.outcome == 'success' || steps.stackhpc-cloud-tests.outcome == 'success' || steps.diagnostics.outcome == 'success') }}
 
-      - name: Fail if any Tempest tests failed
+      - name: Fail if any tests failed
         run: |
-          test $(wc -l < tempest-artifacts/failed-tests) -lt 1
-
-      - name: Fail if any StackHPC Cloud tests failed
-        run: |
-          echo "Some StackHPC Cloud tests failed."
-          echo "See HTML results artifact (sct-results) for details."
-          exit 1
-        if: steps.stackhpc-cloud-tests.outcome == 'failure'
+          # Each suite lists its failed runs in <artifact dir>/failed-tests, one
+          # per line. A missing file means the suite never reported a result,
+          # so treat it as a failure rather than letting the job pass silently.
+          rc=0
+          check_failed_tests() {
+            local name=$1 dir=$2
+            if [[ ! -f "$dir/failed-tests" ]]; then
+              echo "No $name test results found ($dir/failed-tests is missing)."
+              rc=1
+            elif [[ $(wc -l < "$dir/failed-tests") -ne 0 ]]; then
+              echo "Some $name tests failed."
+              echo "See HTML results artifact ($dir) for details."
+              rc=1
+            fi
+          }
+          check_failed_tests "Tempest" tempest-artifacts
+          check_failed_tests "StackHPC Cloud" sct-results
+          exit $rc
 
       - name: Destroy
         run: terraform destroy -auto-approve

diff --git a/etc/kayobe/ansible/stackhpc-cloud-tests.yml b/etc/kayobe/ansible/stackhpc-cloud-tests.yml
@@ -1,17 +1,24 @@
 ---
 - name: Run StackHPC Cloud tests
-  hosts: tempest_runner
+  hosts: tempest_runner:overcloud
   tags:
     - stackhpc-cloud-tests
   vars:
     sct_venv: "{{ virtualenv_path }}/sct-venv"
     sct_repo: https://github.com/stackhpc/stackhpc-cloud-tests
+    # Version of SCT to test. The GitHub workflow overrides this with its
+    # stackhpc_cloud_tests_version input, so this default only applies to "local" runs.
     sct_version: main
     sct_timeout: 30
     results_path_local: "{{ lookup('env', 'HOME') }}/sct-results"
   tasks:
     - name: Stackhpc Cloud tests
       block:
+        - name: Assert that there is only one host in the tempest_runner group
+          ansible.builtin.assert:
+            that: groups.get('tempest_runner', []) | length == 1
+            fail_msg: The tempest_runner group should contain exactly one host
+
         - name: Create a temporary directory for tests repo
           ansible.builtin.tempfile:
             state: directory
@@ -45,7 +52,6 @@
         - name: Ensure required individual Python packages are installed
           ansible.builtin.pip:
             name:
-              - "{{ repo_tmpdir.path }}"
               - pytest-html
               - pytest-timeout
             virtualenv: "{{ sct_venv }}"
@@ -60,38 +66,131 @@
             file: "{{ kayobe_env_config_path }}/kolla/passwords.yml"
             name: kolla_passwords
 
-        - name: Run StackHPC Cloud tests
+        # Monitoring tests should run once, executed on the host in the
+        # tempest_runner group.
+        - name: Check for StackHPC Cloud monitoring tests
+          ansible.builtin.stat:
+            path: "{{ repo_tmpdir.path }}/stackhpc_cloud_tests/monitoring"
+          register: stackhpc_cloud_monitoring_tests
+
+        - name: Run StackHPC Cloud monitoring tests
           ansible.builtin.command:
             cmd: >
               {{ sct_venv }}/bin/py.test
-              --html={{ results_tmpdir.path }}/stackhpc-cloud-tests.html
+              --html={{ results_tmpdir.path }}/monitoring.html
               --self-contained-html
-              --pyargs stackhpc_cloud_tests
               --timeout {{ sct_timeout }}
               -rfEx
               -vv
+              "{{ repo_tmpdir.path }}/stackhpc_cloud_tests/monitoring"
           environment:
+            GRAFANA_URL: "{{ sct_grafana_url }}"
+            GRAFANA_USERNAME: "{{ sct_grafana_username }}"
+            GRAFANA_PASSWORD: "{{ sct_grafana_password }}"
             OPENSEARCH_HOSTS: "{{ sct_opensearch_hosts }}"
             OPENSEARCH_PORT: "{{ sct_opensearch_port }}"
             OPENSEARCH_TLS: "{{ sct_opensearch_tls }}"
+            OPENSEARCH_DASHBOARDS_URL: "{{ sct_opensearch_dashboards_url }}"
+            OPENSEARCH_DASHBOARDS_USERNAME: "{{ sct_opensearch_dashboards_username }}"
+            OPENSEARCH_DASHBOARDS_PASSWORD: "{{ sct_opensearch_dashboards_password }}"
             PROMETHEUS_URL: "{{ sct_prometheus_url }}"
             PROMETHEUS_USERNAME: "{{ sct_prometheus_username }}"
             PROMETHEUS_PASSWORD: "{{ sct_prometheus_password }}"
           vars:
             kolla_external_scheme: "{{ 'https' if kolla_enable_tls_external | bool else 'http' }}"
             kolla_internal_scheme: "{{ 'https' if kolla_enable_tls_internal | bool else 'http' }}"
+            sct_grafana_url: "{{ kolla_external_scheme }}://{{ kolla_external_fqdn }}:3000"
+            sct_grafana_username: "grafana_local_admin"
+            sct_grafana_password: "{{ kolla_passwords.grafana_admin_password }}"
             sct_opensearch_hosts: "{{ kolla_internal_fqdn }}"
             sct_opensearch_port: 9200
-            sct_opensearch_tls: false
+            sct_opensearch_tls: "{{ kolla_enable_tls_internal | bool }}"
+            sct_opensearch_dashboards_url: "{{ kolla_external_scheme }}://{{ kolla_external_fqdn }}:5601"
+            sct_opensearch_dashboards_username: "opensearch"
+            sct_opensearch_dashboards_password: "{{ kolla_passwords.opensearch_dashboards_password }}"
             sct_prometheus_url: "{{ kolla_internal_scheme }}://{{ kolla_internal_fqdn }}:9091"
             sct_prometheus_username: admin
             sct_prometheus_password: "{{ kolla_passwords.prometheus_password }}"
+          # pytest exits 1 when tests fail. Do not fail the task for that, so
+          # results are still collected and failures land in failed-tests.
+          failed_when: monitoring_results.rc not in [0, 1]
+          register: monitoring_results
+          when: "'tempest_runner' in group_names and stackhpc_cloud_monitoring_tests.stat.exists"
+
+        # Host tests should run on every host in the overcloud group.
+        # TODO: Use TestInfra's native Ansible or SSH connection plugins for
+        # remote test execution? That would place all results in a single file
+        # and allow us to execute all tests from a single host.
+        # https://testinfra.readthedocs.io/en/latest/backends.html#connection-backends
+        - name: Check for StackHPC Cloud host tests
+          ansible.builtin.stat:
+            path: "{{ repo_tmpdir.path }}/stackhpc_cloud_tests/host"
+          register: stackhpc_cloud_host_tests
+
+        - name: Run StackHPC Cloud host tests
+          ansible.builtin.command:
+            cmd: >
+              {{ sct_venv }}/bin/py.test
+              --html={{ results_tmpdir.path }}/host-{{ inventory_hostname }}.html
+              --self-contained-html
+              --timeout {{ sct_timeout }}
+              -vv
+              "{{ repo_tmpdir.path }}/stackhpc_cloud_tests/host"
+          environment:
+            DOCKER_VERSION_MIN: "{{ sct_docker_version_min }}"
+            DOCKER_VERSION_MAX: "{{ sct_docker_version_max }}"
+            SELINUX_STATE: "{{ sct_selinux_state }}"
+          vars:
+            # Inclusive min
+            sct_docker_version_min: "24.0.0"
+            # Exclusive max
+            sct_docker_version_max: "28.0.0"
+            sct_selinux_state: "{{ selinux_state }}"
+          # An rc of 1 (pytest test failures) is reported via failed-tests instead.
+          failed_when: host_results.rc not in [0, 1]
+          register: host_results
+          # Some host checks may need to run as root
+          become: true
+          when: "'overcloud' in group_names and stackhpc_cloud_host_tests.stat.exists"
+
+        # Host test results will be owned by root - we need to read and delete them
+        - name: Change permissions on SCT host test results
+          ansible.builtin.file:
+            path: "{{ results_tmpdir.path }}/host-{{ inventory_hostname }}.html"
+            mode: "0666"
+          become: true
+          when: "'overcloud' in group_names and stackhpc_cloud_host_tests.stat.exists"
+
       always:
-        - name: Fetch results
-          ansible.builtin.fetch:
-            src: "{{ results_tmpdir.path }}/stackhpc-cloud-tests.html"
+        - name: Synchronize results
+          ansible.posix.synchronize:
+            src: "{{ results_tmpdir.path }}/"
             dest: "{{ results_path_local }}/"
-            flat: true
+            mode: pull
+            archive: false
+            recursive: true
+            # For jump host
+            use_ssh_args: true
+
+        # One line per failed test run. The CI workflow fails the job when
+        # sct-results/failed-tests is non-empty.
+        - name: Write a file containing failed test runs
+          ansible.builtin.copy:
+            content: |-
+              {% for host in ansible_play_hosts_all %}
+              {% if host not in ansible_play_hosts %}
+              {{ host }}: Host failure
+              {% endif %}
+              {% if hostvars[host].monitoring_results.rc | default(0) != 0 %}
+              monitoring.html
+              {% endif %}
+              {% if hostvars[host].host_results.rc | default(0) != 0 %}
+              host-{{ host }}.html
+              {% endif %}
+              {% endfor %}
+            dest: "{{ results_path_local }}/failed-tests"
+          delegate_to: localhost
+          run_once: true
 
         - name: Clean up temporary directory
           ansible.builtin.file:
@@ -100,3 +199,5 @@
           loop:
             - "{{ repo_tmpdir.path }}"
             - "{{ results_tmpdir.path }}"
+          # Some files used by host tests may now be owned by root
+          become: true
diff --git a/etc/kayobe/inventory/group_vars/all/selinux b/etc/kayobe/inventory/group_vars/all/selinux
@@ -0,0 +1,5 @@
+---
+# Target SELinux state for hosts, passed to the StackHPC cloud host tests.
+# NOTE(MaxN): The tests check that each host's SELinux state matches this target. The value defined
+# upstream is not accessible here, so it is redefined and must be updated to follow any upstream change.
+selinux_state: permissive