Skip to content

Commit

Permalink
workspace dockerize changes
Browse files Browse the repository at this point in the history
Signed-off-by: payalcha <[email protected]>
  • Loading branch information
payalcha committed Feb 19, 2025
1 parent 4885b8e commit 2cefc03
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 11 deletions.
84 changes: 84 additions & 0 deletions .github/workflows/task_runner_straggler_e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
---
# Task Runner E2E tests with straggler handling policy with bare metal approach.
# Two independent jobs run one straggler test each from tr_resiliency_tests.py:
#   - test_straggler_percent_cutoff
#   - test_straggler_cutoff

name: Task_Runner_Straggler_E2E  # Please do not modify the name as it is used in the composite action

on:
  schedule:
    - cron: "0 7 * * *"  # Run every day at 7 am UTC
  workflow_dispatch:

permissions:
  contents: read

jobs:
  test_straggler_percentage_cutoff:
    name: Straggler PercentageCutoff
    # Scheduled runs are limited to the upstream organisation; manual runs are always allowed.
    if: |
      (github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
      (github.event_name == 'workflow_dispatch')
    runs-on: ubuntu-22.04
    timeout-minutes: 30
    # NOTE: "fail-fast" is only valid under "strategy" (matrix jobs); this job has no
    # matrix, so the key was removed to keep the workflow schema-valid.
    env:
      MODEL_NAME: 'torch/mnist_straggler_check'
      # Quoted on purpose: an unquoted 3.10 is parsed by YAML as the float 3.1.
      PYTHON_VERSION: "3.10"
    steps:
      - name: Checkout OpenFL repository
        id: checkout_openfl
        uses: actions/checkout@v4
        with:
          fetch-depth: 2  # needed for detecting changes
          submodules: "true"
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Pre test run
        uses: ./.github/actions/tr_pre_test_run
        if: ${{ always() }}

      - name: Run Task Runner E2E tests with TLS
        id: run_tests
        run: |
          python -m pytest -s tests/end_to_end/test_suites/tr_resiliency_tests.py -k test_straggler_percent_cutoff
          echo "Task runner end to end test run completed"

      - name: Post test run
        uses: ./.github/actions/tr_post_test_run
        if: ${{ always() }}  # run the post-test action even when the tests fail
        with:
          test_type: "With_TLS"

  test_straggler_cutoff:
    # Previously labelled "Without TLS", but no TLS-related option is passed to pytest
    # and the post-test step reports "With_TLS"; the label now describes the test run.
    name: Straggler Cutoff
    if: |
      (github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
      (github.event_name == 'workflow_dispatch')
    runs-on: ubuntu-22.04
    timeout-minutes: 30
    env:
      MODEL_NAME: 'torch/mnist_straggler_check'
      # Quoted on purpose: an unquoted 3.10 is parsed by YAML as the float 3.1.
      PYTHON_VERSION: "3.10"
    steps:
      - name: Checkout OpenFL repository
        id: checkout_openfl
        uses: actions/checkout@v4
        with:
          fetch-depth: 2  # needed for detecting changes
          submodules: "true"
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Pre test run
        uses: ./.github/actions/tr_pre_test_run
        if: ${{ always() }}

      - name: Run Task Runner E2E tests with TLS
        id: run_tests
        run: |
          python -m pytest -s tests/end_to_end/test_suites/tr_resiliency_tests.py -k test_straggler_cutoff
          echo "Task runner end to end test run completed"

      - name: Post test run
        uses: ./.github/actions/tr_post_test_run
        if: ${{ always() }}  # run the post-test action even when the tests fail
        with:
          # NOTE(review): both jobs pass the same test_type; confirm the post-test
          # action does not derive colliding artifact names from it.
          test_type: "With_TLS"
10 changes: 2 additions & 8 deletions tests/end_to_end/models/model_owner.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,19 +251,13 @@ def certify_workspace(self):
except Exception as e:
raise ex.WorkspaceCertificationException(f"{error_msg}: {e}")

def dockerize_workspace(self):
def dockerize_workspace(self, image_name):
"""
Dockerize the workspace. It internally uses workspace name as the image name
"""
log.info("Dockerizing the workspace. It will take some time to complete..")
try:
if not os.getenv("GITHUB_REPOSITORY") or not os.getenv("GITHUB_BRANCH"):
repo, branch = ssh.get_git_repo_and_branch()
else:
repo = os.getenv("GITHUB_REPOSITORY")
branch = os.getenv("GITHUB_BRANCH")

cmd = f"fx workspace dockerize --save --revision {repo}@{branch}"
cmd = f"fx workspace dockerize --base-image {image_name} --save"
error_msg = "Failed to dockerize the workspace"
return_code, output, error = fh.run_command(
cmd,
Expand Down
6 changes: 4 additions & 2 deletions tests/end_to_end/test_suites/tr_resiliency_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def fx_configure_request_cutoffpolicy(request):
}
}


@pytest.mark.task_runner_basic
def test_federation_via_native_with_restarts(request, fx_federation_tr):
"""
Expand Down Expand Up @@ -93,7 +94,7 @@ def test_federation_via_native_with_restarts(request, fx_federation_tr):


@pytest.mark.straggler_tests
def test_straggler_cutoff_tests(request, fx_configure_request_cutoffpolicy, fx_federation_tr):
def test_straggler_cutoff(request, fx_configure_request_cutoffpolicy, fx_federation_tr):
"""
Test federation with stragglers
Args:
Expand Down Expand Up @@ -152,7 +153,7 @@ def test_straggler_cutoff_tests(request, fx_configure_request_cutoffpolicy, fx_f


@pytest.mark.straggler_tests
def test_straggler_percent_tests(request, fx_configure_request_percentagepolicy, fx_federation_tr):
def test_straggler_percent_cutoff(request, fx_configure_request_percentagepolicy, fx_federation_tr):
"""
Test federation with stragglers
Args:
Expand Down Expand Up @@ -209,6 +210,7 @@ def test_straggler_percent_tests(request, fx_configure_request_percentagepolicy,
f"Successfully tested federation experiment with multiple restart scenarios"
)


@pytest.mark.task_runner_dockerized_ws
def test_federation_via_dws_with_restarts(request, fx_federation_tr_dws):
"""
Expand Down
1 change: 1 addition & 0 deletions tests/end_to_end/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class ModelName(Enum):
REMOVE_OPENFL_NW = "docker network rm"
DOCKER_NETWORK_NAME = "openfl"
DEFAULT_OPENFL_IMAGE = "openfl:latest"
DEFAULT_OPENFL_DOCKERFILE = "openfl-docker/Dockerfile.base"

AGG_WORKSPACE_PATH = "{}/aggregator/workspace" # example - /tmp/my_federation/aggregator/workspace
COL_WORKSPACE_PATH = "{}/{}/workspace" # example - /tmp/my_federation/collaborator1/workspace
Expand Down
18 changes: 18 additions & 0 deletions tests/end_to_end/utils/docker_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,21 @@ def stop_start_docker_participant(participant, action):
container_names.append(container.name)

return True


def build_docker_image(image_name, dockerfile_path):
    """
    Build a docker image through the docker client, using the current working
    directory as the build context.
    Args:
        image_name (str): Name of the image to build (applied as the image tag)
        dockerfile_path (str): Path to the Dockerfile
    Returns:
        The image object returned by the docker client for the built image
    Raises:
        ex.DockerException: If the build fails for any reason
    """
    client = get_docker_client()
    log.info(f"Building docker image {image_name}")

    try:
        # images.build returns a pair; only the first element (the image) is needed.
        image, _ = client.images.build(path=".", dockerfile=dockerfile_path, tag=image_name)
    except Exception as e:
        # Chain the original error so the root cause stays visible in the traceback.
        raise ex.DockerException(f"Error building docker image: {e}") from e

    log.info(f"Image {image_name} built successfully")
    return image
36 changes: 36 additions & 0 deletions tests/end_to_end/utils/federation_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@ def federation_env_setup_and_validate(request, eval_scope=False):

workspace_path = local_bind_path

_remove_stale_processes()
# if path exists delete it
if os.path.exists(workspace_path):
shutil.rmtree(workspace_path)
Expand Down Expand Up @@ -1033,3 +1034,38 @@ def set_keras_backend(model_name):
os.environ["KERAS_BACKEND"] = backend

return [f"KERAS_BACKEND={backend}"]


def _remove_stale_processes():
    """
    Kill leftover aggregator/collaborator processes from earlier runs.

    This is a best-effort cleanup: any failure is logged as a warning and never
    raised, so the calling setup code continues regardless.

    Processes are selected by matching "collaborator" or "aggregator" against the
    full command line. The current process and its parent are always excluded so
    that a test run started with such a keyword in its own arguments (for example
    an option like "--num_collaborators") does not kill itself.
    """
    log.info("Removing stale processes..")
    protected_pids = {os.getpid(), os.getppid()}

    try:
        # pgrep never reports itself, and running it without a shell avoids
        # matching a wrapper "sh -c ..." whose command line contains the pattern.
        lookup = subprocess.run(
            ["pgrep", "-f", "collaborator|aggregator"],
            capture_output=True,
            text=True,
            check=False,
        )
        # pgrep exits with 1 when nothing matched; higher codes are real errors.
        if lookup.returncode > 1:
            log.warning(f"Failed to list stale processes: {lookup.stderr.strip()}")
            return

        stale_pids = [pid for pid in lookup.stdout.split() if int(pid) not in protected_pids]
        if not stale_pids:
            log.info("No stale processes found")
            return

        subprocess.run(["sudo", "kill", "-9", *stale_pids], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing pgrep/sudo binary; keep the cleanup best-effort.
        log.warning(f"Failed to kill processes: {e}")


def build_docker_image(image_name, dockerfile_path):
    """
    Build a docker image with the docker CLI, using the current working
    directory as the build context.
    Args:
        image_name (str): Name of the image to build (passed to "docker build -t")
        dockerfile_path (str): Path to the Dockerfile (passed to "docker build -f")
    Raises:
        ex.DockerException: If the docker command cannot be run or exits with an error
    """
    log.info(f"Building docker image {image_name}")

    # Pass arguments as a list (no shell) so names or paths containing spaces or
    # shell metacharacters reach docker unchanged.
    build_cmd = ["docker", "build", "-t", image_name, "-f", dockerfile_path, "."]
    try:
        subprocess.run(build_cmd, check=True)
    except Exception as e:
        # Chain the original error so the root cause stays visible in the traceback.
        raise ex.DockerException(f"Error building docker image: {e}") from e
6 changes: 5 additions & 1 deletion tests/end_to_end/utils/tr_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import tests.end_to_end.utils.federation_helper as fh
import tests.end_to_end.utils.ssh_helper as ssh
from tests.end_to_end.models import aggregator as agg_model, model_owner as mo_model
import tests.end_to_end.utils.docker_helper as dh

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -175,9 +176,12 @@ def create_tr_dws_workspace(request, eval_scope=False):
workspace_path, local_bind_path, agg_domain_name, model_owner, plan_path, agg_workspace_path = common_workspace_creation(request, eval_scope)
model_name = request.config.model_name

# Create openfl image
fh.build_docker_image(constants.DEFAULT_OPENFL_IMAGE, constants.DEFAULT_OPENFL_DOCKERFILE)

# Command 'fx workspace dockerize --save ..' will use the workspace name for
# image name which is 'workspace' in this case.
model_owner.dockerize_workspace()
model_owner.dockerize_workspace(constants.DEFAULT_OPENFL_DOCKERFILE)
image_name = constants.DFLT_DOCKERIZE_IMAGE_NAME

# Certify the workspace in case of TLS
Expand Down

0 comments on commit 2cefc03

Please sign in to comment.