Task Runner E2E - Added Resiliency Tests (securefederatedai#1347)

* Task Runner E2E - Added Resiliency Tests Signed-off-by: noopur <[email protected]> * Correction in collaborator_memory_usage_file assignment Signed-off-by: noopur <[email protected]> * Enabled option to select a job in DWS workflow Signed-off-by: noopur <[email protected]> * Temp added resiliency to existing wf for testing Signed-off-by: noopur <[email protected]> * Temp added resiliency to existing wf for testing Signed-off-by: noopur <[email protected]> * Some logs moved from info to debug Signed-off-by: noopur <[email protected]> * Skip workspace.tar from artifacts due to its size in GBs Signed-off-by: noopur <[email protected]> * Removed resiliency tests from existing workflows Signed-off-by: noopur <[email protected]> * Code format check Signed-off-by: noopur <[email protected]> * Assert increase in current round after restart + wait Signed-off-by: noopur <[email protected]> * Reverted mandatory check for model_name from conftest.py Signed-off-by: noopur <[email protected]> * All review comments incorporated Signed-off-by: noopur <[email protected]> * Missed adding model_name back in summary help Signed-off-by: noopur <[email protected]> * Missed adding model_name back in summary help Signed-off-by: noopur <[email protected]> * Set best score as 'Not Found' if tensor.db file is not present Signed-off-by: noopur <[email protected]> * Specific error msg when db fetch fails Signed-off-by: noopur <[email protected]> * Handle FedEval case in docker Signed-off-by: noopur <[email protected]> * Check total rounds during round increment check Signed-off-by: noopur <[email protected]> * Handle multiple / in the model names Signed-off-by: noopur <[email protected]> * Skip dockerized resiliency Signed-off-by: noopur <[email protected]> --------- Signed-off-by: noopur <[email protected]> Co-authored-by: Payal Chaurasiya <[email protected]>
payalcha · Feb 10, 2025 · d8f0aaf · d8f0aaf
1 parent b18e978
commit d8f0aaf
Show file tree

Hide file tree

Showing 25 changed files with 1,497 additions and 306 deletions.
diff --git a/.github/actions/tr_post_test_run/action.yml b/.github/actions/tr_post_test_run/action.yml
@@ -20,14 +20,14 @@ runs:
         echo "Test summary printed"
       shell: bash
 
-    - name: Create Tar (exclude cert and data folders)
+    - name: Create Tar (exclude folders - cert/data/__pycache__, files - tensor.db/workspace.tar)
       id: tar_files
       if: ${{ always() }}
       run: |
-        tar -cvf result.tar --exclude="cert" --exclude="data" --exclude="__pycache__" $HOME/results
+        tar -cvf result.tar --exclude="cert" --exclude="data" --exclude="__pycache__" --exclude="tensor.db" --exclude="workspace.tar" $HOME/results
         # Model names may contain forward slashes; replace every one of them with an underscore.
         tmp=${{ env.MODEL_NAME }}
-        echo "MODEL_NAME_MODIFIED=${tmp/\//_}" >> $GITHUB_ENV
+        echo "MODEL_NAME_MODIFIED=${tmp//\//_}" >> $GITHUB_ENV
       shell: bash
 
     - name: Upload Artifacts

diff --git a/.github/workflows/task_runner_basic_e2e.yml b/.github/workflows/task_runner_basic_e2e.yml
@@ -6,6 +6,9 @@ name: Task_Runner_E2E  # Please do not modify the name as it is used in the comp
 on:
   schedule:
     - cron: "0 0 * * *" # Run every day at midnight
+  push:
+    branches: 
+      - develop
   workflow_dispatch:
     inputs:
       num_rounds:
@@ -44,8 +47,8 @@ on:
         options:
           - all
           - test_with_tls
-          - test_with_non_tls
-          - test_with_no_client_auth
+          - test_without_tls
+          - test_without_client_auth
           - test_memory_logs
         required: false
 
@@ -65,7 +68,8 @@ jobs:
   input_selection:
     if: |
       (github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
-      (github.event_name == 'workflow_dispatch')
+      (github.event_name == 'workflow_dispatch') ||
+      (github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'task_runner_e2e'))
     name: Input value selection
     runs-on: ubuntu-22.04
     outputs:
@@ -119,7 +123,7 @@ jobs:
           fi
 
   test_with_tls:
-    name: Test with TLS
+    name: With TLS
     needs: input_selection
     if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_with_tls'
     runs-on: ubuntu-22.04
@@ -159,12 +163,12 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: "tr_tls"
+          test_type: "With_TLS"
 
-  test_with_non_tls:
-    name: Test without TLS
+  test_without_tls:
+    name: Without TLS
     needs: input_selection
-    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_with_non_tls'
+    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_without_tls'
     runs-on: ubuntu-22.04
     timeout-minutes: 30
     strategy:
@@ -202,12 +206,12 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: "tr_non_tls"
+          test_type: "Without_TLS"
 
-  test_with_no_client_auth:
-    name: Test without client auth
+  test_without_client_auth:
+    name: Without Client Auth
     needs: input_selection
-    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_with_no_client_auth'
+    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_without_client_auth'
     runs-on: ubuntu-22.04
     timeout-minutes: 30
     strategy:
@@ -245,10 +249,10 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: 'tr_no_client_auth'
+          test_type: 'Without_Client_Auth'
 
   test_memory_logs:
-    name: Test memory usage
+    name: With Memory Logs
     needs: input_selection
     if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_memory_logs'
     runs-on: ubuntu-22.04
@@ -289,4 +293,4 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: "tr_tls_memory_logs"
+          test_type: "With_Memory_Logs"
diff --git a/.github/workflows/task_runner_dockerized_ws_e2e.yml b/.github/workflows/task_runner_dockerized_ws_e2e.yml
@@ -16,6 +16,17 @@ on:
         required: false
         default: "2"
         type: string
+      jobs_to_run:
+        description: "Jobs to run"
+        type: choice
+        default: "all"
+        options:
+          - all
+          - test_with_tls
+          - test_without_tls
+          - test_without_client_auth
+          - test_memory_logs
+        required: false
 
 permissions:
   contents: read
@@ -24,10 +35,24 @@ permissions:
 env:
   NUM_ROUNDS: ${{ inputs.num_rounds || '5' }}
   NUM_COLLABORATORS: ${{ inputs.num_collaborators || '2' }}
+  JOBS_TO_RUN: ${{ inputs.jobs_to_run || 'all' }}
 
 jobs:
-  test_with_tls_dockerized_ws:
-    name: tr_tls_dockerized_ws
+  input_selection:
+    name: Input value selection
+    runs-on: ubuntu-22.04
+    outputs:
+      selected_jobs: ${{ steps.input_selection.outputs.jobs_to_run }}
+    steps:
+      - name: Job to select input values
+        id: input_selection
+        run: |
+          echo "jobs_to_run=${{ env.JOBS_TO_RUN }}" >> "$GITHUB_OUTPUT"
+
+  test_with_tls:
+    name: With TLS
+    needs: input_selection
+    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_with_tls'
     runs-on: ubuntu-22.04
     timeout-minutes: 15
     strategy:
@@ -65,10 +90,12 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: "tr_tls_dockerized_ws"
+          test_type: "With_TLS"
 
-  test_with_non_tls_dockerized_ws:
-    name: tr_non_tls_dockerized_ws
+  test_without_tls:
+    name: Without TLS
+    needs: input_selection
+    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_without_tls'
     runs-on: ubuntu-22.04
     timeout-minutes: 15
     strategy:
@@ -106,10 +133,12 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: "tr_non_tls_dockerized_ws"
+          test_type: "Without_TLS"
 
-  test_with_no_client_auth_dockerized_ws:
-    name: tr_no_client_auth_dockerized_ws
+  test_without_client_auth:
+    name: Without Client Auth
+    needs: input_selection
+    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_without_client_auth'
     runs-on: ubuntu-22.04
     timeout-minutes: 15
     strategy:
@@ -147,10 +176,12 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: "tr_no_client_auth_dockerized_ws"
+          test_type: "Without_Client_Auth"
 
-  test_memory_logs_dockerized_ws:
-    name: tr_tls_memory_logs_dockerized_ws
+  test_memory_logs:
+    name: With Memory Logs
+    needs: input_selection
+    if: needs.input_selection.outputs.selected_jobs == 'all' || needs.input_selection.outputs.selected_jobs == 'test_memory_logs'
     runs-on: ubuntu-22.04
     timeout-minutes: 15
     strategy:
@@ -189,4 +220,4 @@ jobs:
         uses: ./.github/actions/tr_post_test_run
         if: ${{ always() }}
         with:
-          test_type: "tr_tls_memory_logs_dockerized_ws"
+          test_type: "With_Memory_Logs"
diff --git a/.github/workflows/task_runner_e2e_resiliency.yml b/.github/workflows/task_runner_e2e_resiliency.yml
@@ -0,0 +1,159 @@
+---
+# Task Runner E2E tests for resiliency. It includes both - native and dockerized environments.
+
+name: Task_Runner_E2E_Resiliency  # Please do not modify the name as it is used in the composite action
+
+# Triggers: a daily schedule, pushes to develop, and manual dispatch with optional overrides.
+on:
+  schedule:
+    # Run every day at 5 am UTC
+    - cron: "0 5 * * *"
+  push:
+    branches:
+      - develop
+  workflow_dispatch:
+    inputs:
+      num_rounds:
+        description: "Number of rounds to train"
+        type: string
+        default: "50"
+        required: false
+      num_collaborators:
+        description: "Number of collaborators"
+        type: string
+        default: "2"
+        required: false
+      model_name:
+        description: "Model name"
+        type: choice
+        options:
+          - all
+          - torch/mnist
+          - keras/mnist
+        default: "all"
+        required: false
+      python_version:
+        description: "Python version"
+        type: choice
+        # Quoted so that 3.10 is not read as the float 3.1
+        options:
+          - "3.10"
+          - "3.11"
+          - "3.12"
+        default: "3.10"
+        required: false
+
+permissions:
+  contents: read
+
+# Environment variables common for all the jobs.
+# Each value falls back to its default when the corresponding dispatch input is absent
+# (for example on schedule or push runs).
+# DO NOT use double quotes for the values of the environment variables
+env:
+  NUM_ROUNDS: ${{ inputs.num_rounds || '50' }}
+  NUM_COLLABORATORS: ${{ inputs.num_collaborators || '2' }}
+  MODEL_NAME: ${{ inputs.model_name || 'all' }}
+  PYTHON_VERSION: ${{ inputs.python_version || '3.10' }}
+
+jobs:
+  # Gate job: decides whether the run proceeds and publishes the list of models
+  # that the downstream job expands into its matrix.
+  input_selection:
+    name: Input value selection
+    runs-on: ubuntu-22.04
+    # NOTE(review): no clause below tests for 'push' events, and the pull_request clause
+    # has no matching trigger in this workflow - confirm which events are meant to run it.
+    if: |
+      (github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
+      (github.event_name == 'workflow_dispatch') ||
+      (github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'task_runner_e2e'))
+    outputs:
+      # JSON array of model names; consumers decode it with fromJson() in their matrix strategy.
+      selected_models_for_tls: ${{ steps.input_selection.outputs.models_for_tls }}
+    steps:
+      - name: Job to select input values
+        id: input_selection
+        run: |
+          # 'all' expands to every listed model; any other value is used as the single entry.
+          if [ "${{ env.MODEL_NAME }}" == "all" ]; then
+            models='["torch/mnist", "keras/mnist"]'
+          else
+            models='["${{ env.MODEL_NAME }}"]'
+          fi
+          echo "models_for_tls=${models}" >> "$GITHUB_OUTPUT"
+
+  # Runs the resiliency test suite in the native (non-dockerized) environment,
+  # once per model published by the input_selection job.
+  resiliency_in_native:
+    name: With TLS (Native)
+    needs: input_selection
+    runs-on: ubuntu-22.04
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        model_name: ${{ fromJson(needs.input_selection.outputs.selected_models_for_tls) }}
+      fail-fast: false # do not immediately fail if one of the combinations fail
+
+    env:
+      MODEL_NAME: ${{ matrix.model_name }}
+      # PYTHON_VERSION is deliberately not overridden here: this matrix defines no
+      # python_version key, so a job-level override would evaluate to an empty string
+      # and mask the workflow-level PYTHON_VERSION (and with it the python_version input).
+
+    steps:
+      - name: Checkout OpenFL repository
+        id: checkout_openfl
+        uses: actions/[email protected]
+        with:
+          fetch-depth: 2 # needed for detecting changes
+          submodules: "true"
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Pre test run
+        uses: ./.github/actions/tr_pre_test_run
+        if: ${{ always() }}
+
+      - name: Run Task Runner E2E tests with TLS
+        id: run_tests
+        run: |
+          python -m pytest -s tests/end_to_end/test_suites/tr_resiliency_tests.py \
+          -m task_runner_basic --model_name ${{ env.MODEL_NAME }} \
+          --num_collaborators ${{ env.NUM_COLLABORATORS }} --num_rounds ${{ env.NUM_ROUNDS }}
+          echo "Task runner end to end test run completed"
+
+      # Always runs so that results are collected even when the tests fail.
+      - name: Post test run
+        uses: ./.github/actions/tr_post_test_run
+        if: ${{ always() }}
+        with:
+          test_type: "Resiliency_Native"
+
+  # TODO - Once we have GitHub runners with higher configurations, we can enable this job.
+  # resiliency_in_dws:
+  #   name: With TLS (Dockerized)
+  #   needs: input_selection
+  #   runs-on: ubuntu-22.04
+  #   timeout-minutes: 30
+  #   strategy:
+  #     matrix:
+  #       # Dockerized WS for other models require higher config runners.
+  #       # Once the issue is fixed, we can enable the tests for other models as well.
+  #       model_name: ["keras/mnist"]
+  #     fail-fast: false # do not immediately fail if one of the combinations fail
+
+  #   env:
+  #     MODEL_NAME: ${{ matrix.model_name }}
+  #     PYTHON_VERSION: ${{ matrix.python_version }}
+
+  #   steps:
+  #     - name: Checkout OpenFL repository
+  #       id: checkout_openfl
+  #       uses: actions/[email protected]
+  #       with:
+  #         fetch-depth: 2 # needed for detecting changes
+  #         submodules: "true"
+  #         token: ${{ secrets.GITHUB_TOKEN }}
+
+  #     - name: Pre test run
+  #       uses: ./.github/actions/tr_pre_test_run
+  #       if: ${{ always() }}
+
+  #     - name: Run Task Runner E2E tests with TLS
+  #       id: run_tests
+  #       run: |
+  #         python -m pytest -s tests/end_to_end/test_suites/tr_resiliency_tests.py \
+  #         -m task_runner_dockerized_ws --model_name ${{ env.MODEL_NAME }} \
+  #         --num_collaborators ${{ env.NUM_COLLABORATORS }} --num_rounds ${{ env.NUM_ROUNDS }}
+  #         echo "Task runner end to end test run completed"
+
+  #     - name: Post test run
+  #       uses: ./.github/actions/tr_post_test_run
+  #       if: ${{ always() }}
+  #       with:
+  #         test_type: "Resiliency_Dockerized"