From 286bb0bace7a1cdfc065213f04e82572bd9714ca Mon Sep 17 00:00:00 2001 From: "Alex Ellis (OpenFaaS Ltd)" Date: Fri, 20 Oct 2023 14:58:28 +0100 Subject: [PATCH 1/2] Switch over to managed Arm64 hosts This change switches over from 2x self-managed runners where side effects are possible between builds, to a pool of servers where each build runs in an isolated VM. The service is provided by actuated.dev, and sponsored by both Ampere and the CNCF. Signed-off-by: Alex Ellis (OpenFaaS Ltd) --- .github/workflows/e2e-arm64-template.yaml | 2 +- .github/workflows/robustness-nightly.yaml | 4 +- .github/workflows/tests-arm64-template.yaml | 2 +- Documentation/infra-guide/arm64-infra.md | 129 ++------------------ 4 files changed, 12 insertions(+), 125 deletions(-) diff --git a/.github/workflows/e2e-arm64-template.yaml b/.github/workflows/e2e-arm64-template.yaml index 2ef44639341..8301870d0ef 100644 --- a/.github/workflows/e2e-arm64-template.yaml +++ b/.github/workflows/e2e-arm64-template.yaml @@ -11,7 +11,7 @@ jobs: test: # this is to prevent the job to run at forked projects if: github.repository == 'etcd-io/etcd' - runs-on: [self-hosted, Linux, ARM64] + runs-on: actuated-arm64-8cpu-32gb container: golang:1.21-bookworm defaults: run: diff --git a/.github/workflows/robustness-nightly.yaml b/.github/workflows/robustness-nightly.yaml index e3e1d51f3cd..f027e50644f 100644 --- a/.github/workflows/robustness-nightly.yaml +++ b/.github/workflows/robustness-nightly.yaml @@ -23,7 +23,7 @@ jobs: count: 80 testTimeout: 200m artifactName: main-arm64 - runs-on: "['self-hosted', 'Linux', 'ARM64']" + runs-on: actuated-arm64-8cpu-32gb release-35: uses: ./.github/workflows/robustness-template.yaml with: @@ -39,7 +39,7 @@ jobs: count: 100 testTimeout: 200m artifactName: release-35-arm64 - runs-on: "['self-hosted', 'Linux', 'ARM64']" + runs-on: actuated-arm64-8cpu-32gb release-34: uses: ./.github/workflows/robustness-template.yaml with: diff --git a/.github/workflows/tests-arm64-template.yaml 
b/.github/workflows/tests-arm64-template.yaml index 4721af53684..a0ee72a39b5 100644 --- a/.github/workflows/tests-arm64-template.yaml +++ b/.github/workflows/tests-arm64-template.yaml @@ -16,7 +16,7 @@ jobs: test: # this is to prevent the job to run at forked projects if: github.repository == 'etcd-io/etcd' - runs-on: [self-hosted, Linux, ARM64] + runs-on: actuated-arm64-8cpu-32gb container: golang:1.21-bookworm defaults: run: diff --git a/Documentation/infra-guide/arm64-infra.md b/Documentation/infra-guide/arm64-infra.md index 90b35fde384..02c0e79726d 100644 --- a/Documentation/infra-guide/arm64-infra.md +++ b/Documentation/infra-guide/arm64-infra.md @@ -1,130 +1,17 @@ # etcd arm64 test infrastructure -## Infrastructure summary +The infrastructure to build for arm64 is provided by [Equinix Metal](https://www.equinix.com/) via the [CNCF Community Infrastructure Lab](https://github.com/cncf/cluster/issues). -All etcd project pipelines run via github actions. The etcd project currently maintains dedicated infrastructure for running `arm64` continuous integration testing. This is required because currently github actions runner virtual machines are only offered as `x64`. +Previously, several maintainers were responsible for managing two bare-metal machines with a self-hosted runner installed. This was a manual process, and side effects could be left over from previous builds. -The infrastructure consists of two `c3.large.arm` bare metal servers kindly provided by [Equinix Metal](https://www.equinix.com/) via the [CNCF Community Infrastructure Lab]. +As part of a joint program between Ampere and the CNCF, [actuated.dev](https://actuated.dev) is providing managed Arm64 builds. 
-| Hostname | IP | Operating System | Region | -|-------------------------------|----------------|--------------------|---------------| -| etcd-c3-large-arm64-runner-01 | 86.109.7.233 | Ubuntu 22.04.1 LTS | Washington DC | -| etcd-c3-large-arm64-runner-02 | 147.28.151.226 | Ubuntu 22.04.1 LTS | Washington DC | +To use the new infrastructure, add the following to your workflow: -## Infrastructure support - -The etcd project aims to self manage and resolve issues with project infrastructure internally where possible, however if situations emerge where we need to engage support from Equinix Metal we can open an issue under the [CNCF Community Infrastructure Lab] project or contact the [Equinix Metal support team](https://deploy.equinix.com/support). If the situation is urgent contact @vielmetti directly who can provide further assistance or escalation points. - -## Granting infrastructure access - -Etcd arm64 test infrastructure access is closely controlled to ensure the infrastructure is secure and protect the integrity of the etcd project. - -Access to the infrastructure is defined by the infra admins table below: - -| Name | Github | K8s Slack | Email | -|---------------------------|----------------|--------------------|--------------------| -| Marek Siarkowicz | @serathius | @ Serathius | Ref MAINTAINERS | -| Benjamin Wang | @ahrtr | @ Benjamin Wang | Ref MAINTAINERS | -| Davanum Srinivas | @dims | @ Dims | davanum@gmail.com | -| Chao Chen | @chaochn47 | @ Chao Chen | chaochn@amazon.com | -| James Blair | @jmhbnz | @ James Blair | Ref MAINTAINERS | - -Individuals in this table are granted access to the infrastructure in two ways: - -### 1. Equinix metal web console access - -An etcd project exists under the CNCF organisation in the Equinix Metal web console. The direct url to the etcd console is . 
- -When a new person is added to the infra admins table, an existing member or etcd maintainer should raise an issue in the [CNCF Community Infrastructure Labs](https://github.com/cncf/cluster/issues) to ensure they are granted web console access. - -Refer to example issue: . - -### 2. Server ssh access - -Infra admins can ssh directly to the servers with a dedicated user account for each person, usernames are based on github handles for easy recognition in logs. These infra admins will be able to elevate to the `root` user when necessary via `sudo`. - -Access to machines via ssh is strictly via individual ssh key based authentication, and is not permitted directly to the `root` user. Password authentication is never to be used for etcd infrastructure ssh authentication. - -When a new member is added to the infra admins table, and existing member with ssh access should complete the following actions on all etcd servers: - -- create the new user via `sudo adduser `. -- add their public key to `/home//.ssh/authorized_keys` file. Note: Public keys are to be retrieved via github only, example: . -- add the new user to machine sudoers file via `usermod -aG sudo `. - -## Revoking infrastructure access - -When a member is removed from the infra admins table existing members must review servers and ensure their user access to etcd infrastructure is revoked by removing the members `/home//.ssh/authorized_keys` entries. - -Note: When revoking access do not delete a user or their home directory from servers, as access may need to be reinstated in future. - -### Regular access review - -On a regular at least quarterly basis members of the infra admins team are responsible for verifying that no unneccessary infrastructure access exists by reviewing membership of the table above and existing server access. - -## Provisioning new machines - -If the etcd project needs new `arm64` infrastructure we can open an issue with the [CNCF Community Infrastructure Lab]. 
An example etcd request is [here](https://github.com/cncf/cluster/issues/227). - -Note: `arm64` compute capacity is not currently available in all regions, this can be checked with [metal-cli](https://github.com/equinix/metal-cli) `metal capacity get | grep arm`. - -[CNCF Community Infrastructure Lab]: https://github.com/cncf/cluster/issues - -### Setting up a new github actions runner - -Once the new blank machine has been provisioned it needs to be set up as a github actions runner to be able to accept etcd workflow jobs. Follow the steps below to complete this: - -1. **Install pre-requisites** - -With etcd jobs running inside containers we need to ensure the `docker` container engine is present on the machine. We use the `docker.io` package maintained by Ubuntu for this however [official instructions from Docker](https://docs.docker.com/engine/install/ubuntu) are available for reference. - -```bash -# Ensure all packages are up to date -sudo apt update && sudo apt upgrade - -# Install pre-requisites -sudo apt install --yes build-essential git wget curl docker.io - -# Check the docker service is now started and enabled -sudo systemctl status docker.service && sudo docker ps -``` - -2. **Create the runner user** - -For security reasons we do not run the github actions runner as `root`, instead we create a new user `runner` and assign it `docker` permissions via group. - -```bash -# Create new user -sudo adduser runner - -# Grant permissions -sudo usermod -aG docker runner +```yaml +runs-on: actuated-arm64-8cpu-32gb ``` -3. **Follow runner create instructions** +The vCPUs and RAM are customizable, e.g. `actuated-arm64-8cpu-16gb` or `actuated-arm64-8cpu-32gb`. -Once pre-requisites are done we can setup the new runner. Rather than reinvent the wheel we can follow existing Github maintained [documentation](https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/adding-self-hosted-runners#adding-a-self-hosted-runner-to-a-repository).
- -This will essentially require a maintainer navigating to the following url and following the generated steps . - -Switch to the `runner` user and ensure you are in that users home directory before running the generated setup steps. - -```bash -sudo su runner && cd /home/runner -``` - -4. **Test and start actions runner** - -For a final verification, before we start the runner we should check the docker access setup above is working. - -If all is well we can start the runner! - -```bash -# Switch to the runner user -sudo su runner - -# Test runner can docker ps -docker ps - -# Start the runner if all is working -cd /home/runner/actions-runner && nohup ./run.sh & -``` +For urgent support, contact @alexellis or the [actuated team](https://actuated.dev). From a83f58019b796adfab6aba9e76208cfd12d90187 Mon Sep 17 00:00:00 2001 From: "Alex Ellis (OpenFaaS Ltd)" Date: Tue, 24 Oct 2023 10:47:59 +0100 Subject: [PATCH 2/2] Enable manual testing of e2e-arm64-nightly job Adding workflow_dispatch as an "on" trigger enables manual testing by maintainers, without having to wait for the nightly cron schedule. @ahrtr requested this temporary change in order to trigger the arm64 jobs via CI. Signed-off-by: Alex Ellis (OpenFaaS Ltd) --- .github/workflows/e2e-arm64-nightly.yaml | 10 ++++++++++ .github/workflows/robustness-nightly.yaml | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/.github/workflows/e2e-arm64-nightly.yaml b/.github/workflows/e2e-arm64-nightly.yaml index abaec7dfead..4261bb082b3 100644 --- a/.github/workflows/e2e-arm64-nightly.yaml +++ b/.github/workflows/e2e-arm64-nightly.yaml @@ -6,6 +6,16 @@ on: # with individual checkout actions for each of the active release branches schedule: - cron: '30 1 * * *' # runs daily at 1:30 am. 
+ + # @alexellis + # Temporary triggers requested by @ahrtr, remove after merging + # PR: 16801 + push: + pull_request: + + # workflow_dispatch enables manual testing of this job by maintainers + workflow_dispatch: + jobs: main-arm64: uses: ./.github/workflows/e2e-arm64-template.yaml diff --git a/.github/workflows/robustness-nightly.yaml b/.github/workflows/robustness-nightly.yaml index f027e50644f..9de5be2b659 100644 --- a/.github/workflows/robustness-nightly.yaml +++ b/.github/workflows/robustness-nightly.yaml @@ -6,6 +6,16 @@ on: # with individual checkout actions for each of the active release branches schedule: - cron: '25 9 * * *' # runs every day at 09:25 UTC + + # workflow_dispatch enables manual testing of this job by maintainers + workflow_dispatch: + + # @alexellis + # Temporary triggers requested by @ahrtr, remove after merging + # PR: 16801 + push: + pull_request: + jobs: main: # GHA has a maximum amount of 6h execution time, we try to get done within 3h