forked from securefederatedai/openfl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Task Runner E2E - Added Resiliency Tests (securefederatedai#1347)
* Task Runner E2E - Added Resiliency Tests Signed-off-by: noopur <[email protected]> * Correction in collaborator_memory_usage_file assignment Signed-off-by: noopur <[email protected]> * Enabled option to select a job in DWS workflow Signed-off-by: noopur <[email protected]> * Temp added resiliency to existing wf for testing Signed-off-by: noopur <[email protected]> * Temp added resiliency to existing wf for testing Signed-off-by: noopur <[email protected]> * Some logs moved from info to debug Signed-off-by: noopur <[email protected]> * Skip workspace.tar from artifacts due to its size in GBs Signed-off-by: noopur <[email protected]> * Removed resiliency tests from existing workflows Signed-off-by: noopur <[email protected]> * Code format check Signed-off-by: noopur <[email protected]> * Assert increase in current round after restart + wait Signed-off-by: noopur <[email protected]> * Reverted mandatory check for model_name from conftest.py Signed-off-by: noopur <[email protected]> * All review comments incorporated Signed-off-by: noopur <[email protected]> * Missed adding model_name back in summary help Signed-off-by: noopur <[email protected]> * Missed adding model_name back in summary help Signed-off-by: noopur <[email protected]> * Set best score as 'Not Found' if tensor.db file is not present Signed-off-by: noopur <[email protected]> * Specific error msg when db fetch fails Signed-off-by: noopur <[email protected]> * Handle FedEval case in docker Signed-off-by: noopur <[email protected]> * Check total rounds during round increment check Signed-off-by: noopur <[email protected]> * Handle multiple / in the model names Signed-off-by: noopur <[email protected]> * Skip dockerized resiliency Signed-off-by: noopur <[email protected]> --------- Signed-off-by: noopur <[email protected]> Co-authored-by: Payal Chaurasiya <[email protected]>
- Loading branch information
1 parent
b18e978
commit d8f0aaf
Showing
25 changed files
with
1,497 additions
and
306 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
---
# Resiliency end-to-end tests for the Task Runner.
# The native job is active; the dockerized job is kept below as commented-out YAML.

# The composite action refers to this workflow by name - leave it unchanged.
name: Task_Runner_E2E_Resiliency

on:
  schedule:
    # Every day at 05:00 UTC
    - cron: '0 5 * * *'
  push:
    branches:
      - develop
  workflow_dispatch:
    inputs:
      num_rounds:
        description: 'Number of rounds to train'
        type: string
        required: false
        default: '50'
      num_collaborators:
        description: 'Number of collaborators'
        type: string
        required: false
        default: '2'
      model_name:
        description: 'Model name'
        type: choice
        required: false
        default: 'all'
        options:
          - all
          - torch/mnist
          - keras/mnist
      python_version:
        description: 'Python version'
        type: choice
        required: false
        default: '3.10'
        options:
          - '3.10'
          - '3.11'
          - '3.12'

permissions:
  contents: read

# Defaults shared by every job. Each expression falls back to a literal when the
# corresponding workflow_dispatch input is empty.
# DO NOT use double quotes for the values of the environment variables.
env:
  NUM_ROUNDS: ${{ inputs.num_rounds || 50 }}
  NUM_COLLABORATORS: ${{ inputs.num_collaborators || 2 }}
  MODEL_NAME: ${{ inputs.model_name || 'all' }}
  PYTHON_VERSION: ${{ inputs.python_version || '3.10' }}

jobs:
  # Builds the JSON list of model names that feeds the matrix of the test job.
  input_selection:
    # NOTE(review): no clause below matches a `push` event, so runs started by the
    # `push` trigger declared under `on:` skip this job and every job that needs it.
    # The pull_request clause also has no matching trigger in this file - confirm intent.
    if: |
      (github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
      (github.event_name == 'workflow_dispatch') ||
      (github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'task_runner_e2e'))
    name: Input value selection
    runs-on: ubuntu-22.04
    outputs:
      # JSON array (serialized as a string) of model names; the test job expands it
      # with fromJson() into its matrix.
      selected_models_for_tls: ${{ steps.input_selection.outputs.models_for_tls }}
    steps:
      - name: Job to select input values
        id: input_selection
        # MODEL_NAME comes from the workflow-level env and is read as a shell variable
        # rather than templated into the script, so its value cannot change the script text.
        run: |
          if [ "$MODEL_NAME" == "all" ]; then
            echo "models_for_tls=[\"torch/mnist\", \"keras/mnist\"]" >> "$GITHUB_OUTPUT"
          else
            echo "models_for_tls=[\"$MODEL_NAME\"]" >> "$GITHUB_OUTPUT"
          fi

  # Runs the resiliency suite without containers, one matrix entry per selected model.
  resiliency_in_native:
    name: With TLS (Native)
    needs: input_selection
    runs-on: ubuntu-22.04
    timeout-minutes: 30
    strategy:
      matrix:
        model_name: ${{ fromJson(needs.input_selection.outputs.selected_models_for_tls) }}
      fail-fast: false # do not immediately fail if one of the combinations fail

    env:
      MODEL_NAME: ${{ matrix.model_name }}
      # PYTHON_VERSION is intentionally not overridden here: the matrix has no
      # python_version axis, so `matrix.python_version` would evaluate to an empty
      # string and mask the workflow-level PYTHON_VERSION.

    steps:
      - name: Checkout OpenFL repository
        id: checkout_openfl
        # NOTE(review): version tag reconstructed from a garbled ref - confirm.
        uses: actions/checkout@v4.1.1
        with:
          fetch-depth: 2 # needed for detecting changes
          submodules: "true"
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Pre test run
        uses: ./.github/actions/tr_pre_test_run
        if: ${{ always() }}

      - name: Run Task Runner E2E tests with TLS
        id: run_tests
        # Values are passed as quoted shell variables (exported from `env`) instead of
        # being templated into the script, which keeps free-form inputs out of the script text.
        run: |
          python -m pytest -s tests/end_to_end/test_suites/tr_resiliency_tests.py \
            -m task_runner_basic --model_name "$MODEL_NAME" \
            --num_collaborators "$NUM_COLLABORATORS" --num_rounds "$NUM_ROUNDS"
          echo "Task runner end to end test run completed"

      - name: Post test run
        uses: ./.github/actions/tr_post_test_run
        if: ${{ always() }}
        with:
          test_type: "Resiliency_Native"

  # TODO - Once we have GitHub runners with higher configurations, we can enable this job.
  # resiliency_in_dws:
  #   name: With TLS (Dockerized)
  #   needs: input_selection
  #   runs-on: ubuntu-22.04
  #   timeout-minutes: 30
  #   strategy:
  #     matrix:
  #       # Dockerized WS for other models require higher config runners.
  #       # Once the issue is fixed, we can enable the tests for other models as well.
  #       model_name: ["keras/mnist"]
  #     fail-fast: false # do not immediately fail if one of the combinations fail
  #
  #   env:
  #     MODEL_NAME: ${{ matrix.model_name }}
  #     # NOTE: the matrix above has no python_version axis, so the next line would
  #     # evaluate to an empty string; drop it (or add the axis) before enabling this job.
  #     PYTHON_VERSION: ${{ matrix.python_version }}
  #
  #   steps:
  #     - name: Checkout OpenFL repository
  #       id: checkout_openfl
  #       # NOTE(review): version tag reconstructed from a garbled ref - confirm.
  #       uses: actions/checkout@v4.1.1
  #       with:
  #         fetch-depth: 2 # needed for detecting changes
  #         submodules: "true"
  #         token: ${{ secrets.GITHUB_TOKEN }}
  #
  #     - name: Pre test run
  #       uses: ./.github/actions/tr_pre_test_run
  #       if: ${{ always() }}
  #
  #     - name: Run Task Runner E2E tests with TLS
  #       id: run_tests
  #       run: |
  #         python -m pytest -s tests/end_to_end/test_suites/tr_resiliency_tests.py \
  #           -m task_runner_dockerized_ws --model_name ${{ env.MODEL_NAME }} \
  #           --num_collaborators ${{ env.NUM_COLLABORATORS }} --num_rounds ${{ env.NUM_ROUNDS }}
  #         echo "Task runner end to end test run completed"
  #
  #     - name: Post test run
  #       uses: ./.github/actions/tr_post_test_run
  #       if: ${{ always() }}
  #       with:
  #         test_type: "Resiliency_Dockerized"
Oops, something went wrong.