diff --git a/.github/ensure-tester-with-images/action.yml b/.github/ensure-tester-with-images/action.yml index 34cbb7d2b13..23c773caf1e 100644 --- a/.github/ensure-tester-with-images/action.yml +++ b/.github/ensure-tester-with-images/action.yml @@ -40,7 +40,8 @@ runs: for image in ${{ inputs.builder_images_to_copy }} ; do docker images --no-trunc --quiet \$image done" > .success_key - echo "IMAGE_KEY=$(cat .success_key | md5sum | awk '{print $1}')" >> $GITHUB_ENV + # TODO better image key + echo "IMAGE_KEY=$(git rev-parse HEAD)" >> $GITHUB_ENV echo "${{ inputs.run }}" >> .success_key echo "SUCCESS_KEY=$(cat .success_key | md5sum | awk '{print $1}') " >> $GITHUB_ENV diff --git a/.github/ensure-tester/action.yml b/.github/ensure-tester/action.yml index ada040fcb94..4ad920921a5 100644 --- a/.github/ensure-tester/action.yml +++ b/.github/ensure-tester/action.yml @@ -24,19 +24,21 @@ runs: # no github runners, 'bare spot' in action code echo "runner_concurrency=0" >> $GITHUB_OUTPUT echo "ttl=30" >> $GITHUB_OUTPUT + SIZE=large if [[ $TYPE == 4core-* ]]; then - echo "instance_type=m6a.large" >> $GITHUB_OUTPUT + SIZE=large elif [[ $TYPE == 8core-* ]]; then - echo "instance_type=m6a.2xlarge" >> $GITHUB_OUTPUT + SIZE=2xlarge elif [[ $TYPE == 16core-* ]]; then - echo "instance_type=m6a.4xlarge" >> $GITHUB_OUTPUT + SIZE=4xlarge elif [[ $TYPE == 32core-* ]]; then - echo "instance_type=m6a.8xlarge" >> $GITHUB_OUTPUT + SIZE=8xlarge elif [[ $TYPE == 64core-* ]]; then - echo "instance_type=m6a.16xlarge" >> $GITHUB_OUTPUT + SIZE=16xlarge elif [[ $TYPE == 128core-* ]]; then - echo "instance_type=m6a.32xlarge" >> $GITHUB_OUTPUT + SIZE=32xlarge fi + echo "instance_type=m6a.$SIZE r6in.$SIZE r6a.$SIZE i4i.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT - name: Start Tester uses: ./.github/spot-runner-action diff --git a/.github/spot-runner-action/dist/index.js b/.github/spot-runner-action/dist/index.js index 88382c4784d..91cca191ec4 100644 --- a/.github/spot-runner-action/dist/index.js +++ b/.github/spot-runner-action/dist/index.js @@ -246,7 +246,7 @@ class Ec2Instance { return __awaiter(this, void 0, void 0, function* () { const client = yield this.getEc2Client(); const userData = yield new userdata_1.UserData(this.config); - const userDataScript = this.config.githubActionRunnerConcurrency !== 0 ? yield userData.getUserDataForBuilder() : yield userData.getUserDataForBareSpot(); + const userDataScript = yield userData.getUserData(); const ec2InstanceTypeHash = this.getHashOfStringArray(this.config.ec2InstanceType.concat([ userDataScript, JSON.stringify(this.tags), @@ -318,8 +318,8 @@ class Ec2Instance { }, Overrides: this.config.ec2InstanceType.map((instanceType) => ({ InstanceType: instanceType, - AvailabilityZone: availabilityZone, - SubnetId: this.config.ec2SubnetId, + AvailabilityZone: this.config.githubActionRunnerConcurrency > 0 ? availabilityZone : undefined, + SubnetId: this.config.githubActionRunnerConcurrency > 0 ? this.config.ec2SubnetId : undefined, })), }; const createFleetRequest = { @@ -336,10 +336,15 @@ class Ec2Instance { const client = yield this.getEc2Client(); const fleet = yield client.createFleet(createFleetRequest).promise(); if (fleet.Errors && fleet.Errors.length > 0) { + for (const error of fleet.Errors) { + if (error.ErrorCode === "RequestLimitExceeded") { + return "RequestLimitExceeded"; + } + } core.error(JSON.stringify(fleet.Errors, null, 2)); } const instances = ((fleet === null || fleet === void 0 ? void 0 : fleet.Instances) || [])[0] || {}; - return (instances.InstanceIds || [])[0]; + return (instances.InstanceIds || [])[0] || ""; }); } getInstanceStatus(instanceId) { @@ -722,21 +727,21 @@ function requestAndWaitForSpot(config) { } let instanceId = ""; for (const ec2Strategy of ec2SpotStrategies) { + let backoff = 1; core.info(`Starting instance with ${ec2Strategy} strategy`); - // 6 * 10000ms = 1 minute per strategy + // 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff // TODO make longer lived spot request? for (let i = 0; i < 6; i++) { try { // Start instance instanceId = - (yield ec2Client.requestMachine( + yield ec2Client.requestMachine( // we fallback to on-demand - ec2Strategy.toLocaleLowerCase() === "none")) || ""; - if (instanceId) { + ec2Strategy.toLocaleLowerCase() === "none"); + // let's exit, only loop on InsufficientInstanceCapacity + if (instanceId !== "RequestLimitExceeded") { break; } - // let's exit, only loop on InsufficientInstanceCapacity - break; } catch (error) { // TODO is this still the relevant error? @@ -752,7 +757,7 @@ function requestAndWaitForSpot(config) { } } // wait 10 seconds - yield new Promise((r) => setTimeout(r, 10000)); + yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff))); } if (instanceId) { core.info("Successfully requested instance with ID " + instanceId); @@ -1020,25 +1025,7 @@ class UserData { constructor(config) { this.config = config; } - getUserDataForBareSpot() { - return __awaiter(this, void 0, void 0, function* () { - const cmds = [ - "#!/bin/bash", - `exec 1>/run/log.out 2>&1`, - `shutdown -P +${this.config.ec2InstanceTtl}`, - `echo '{"default-address-pools":[{"base":"172.17.0.0/12","size":20}, {"base":"10.99.0.0/12","size":20}, {"base":"192.168.0.0/16","size":24}]}' > /etc/docker/daemon.json`, - `sudo service docker restart`, - "sudo apt install -y brotli", - // NOTE also update versions below and in .github/ci-setup-action/action.yml - "sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly", - "sudo chmod +x /usr/local/bin/earthly", - "touch /home/ubuntu/.user-data-finished", - ]; - console.log("Sending: ", cmds.filter((x) => !x.startsWith("TOKENS")).join("\n")); - return Buffer.from(cmds.join("\n")).toString("base64"); - }); - } - getUserDataForBuilder() { + getUserData() { return __awaiter(this, void 0, void 0, function* () { if (!this.config.githubActionRunnerLabel) throw Error("failed to object job ID for label"); @@ -1053,9 +1040,10 @@ class UserData { `sudo service docker restart`, "sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly", "sudo chmod +x /usr/local/bin/earthly", - "cd /run", "sudo apt install -y brotli", 'echo "MaxStartups 1000" >> /etc/ssh/sshd_config', + 'echo "ClientAliveInterval=30" >> /etc/ssh/sshd_config', + 'echo "ClientAliveCountMax=20" >> /etc/ssh/sshd_config', "sudo service sshd restart", "touch /home/ubuntu/.user-data-finished", ]; diff --git a/.github/spot-runner-action/src/ec2.ts b/.github/spot-runner-action/src/ec2.ts index 4f2bb92b9dd..a00ca587b6c 100644 --- a/.github/spot-runner-action/src/ec2.ts +++ b/.github/spot-runner-action/src/ec2.ts @@ -161,7 +161,7 @@ export class Ec2Instance { const userData = await new UserData( this.config ); - const userDataScript = this.config.githubActionRunnerConcurrency !== 0 ? await userData.getUserDataForBuilder() : await userData.getUserDataForBareSpot(); + const userDataScript = await userData.getUserData(); const ec2InstanceTypeHash = this.getHashOfStringArray( this.config.ec2InstanceType.concat([ userDataScript, @@ -225,7 +225,7 @@ export class Ec2Instance { return launchTemplateName; } - async requestMachine(useOnDemand: boolean): Promise { + async requestMachine(useOnDemand: boolean): Promise { // Note advice re max bid: "If you specify a maximum price, your instances will be interrupted more frequently than if you do not specify this parameter." const launchTemplateName = await this.getLaunchTemplate(); // Launch template name already in use @@ -237,8 +237,8 @@ export class Ec2Instance { }, Overrides: this.config.ec2InstanceType.map((instanceType) => ({ InstanceType: instanceType, - AvailabilityZone: availabilityZone, - SubnetId: this.config.ec2SubnetId, + AvailabilityZone: this.config.githubActionRunnerConcurrency > 0 ? availabilityZone : undefined, + SubnetId: this.config.githubActionRunnerConcurrency > 0 ? this.config.ec2SubnetId : undefined, })), }; const createFleetRequest: CreateFleetRequest = { @@ -255,10 +255,15 @@ export class Ec2Instance { const client = await this.getEc2Client(); const fleet = await client.createFleet(createFleetRequest).promise(); if (fleet.Errors && fleet.Errors.length > 0) { + for (const error of fleet.Errors) { + if (error.ErrorCode === "RequestLimitExceeded") { + return "RequestLimitExceeded"; + } + } core.error(JSON.stringify(fleet.Errors, null, 2)); } const instances: CreateFleetInstance = (fleet?.Instances || [])[0] || {}; - return (instances.InstanceIds || [])[0]; + return (instances.InstanceIds || [])[0] || ""; } async getInstanceStatus(instanceId: string) { diff --git a/.github/spot-runner-action/src/main.ts b/.github/spot-runner-action/src/main.ts index a7b09b6700e..5444cdd90cf 100644 --- a/.github/spot-runner-action/src/main.ts +++ b/.github/spot-runner-action/src/main.ts @@ -61,22 +61,22 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise { let instanceId = ""; for (const ec2Strategy of ec2SpotStrategies) { + let backoff = 1; core.info(`Starting instance with ${ec2Strategy} strategy`); - // 6 * 10000ms = 1 minute per strategy + // 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff // TODO make longer lived spot request? for (let i = 0; i < 6; i++) { try { // Start instance instanceId = - (await ec2Client.requestMachine( + await ec2Client.requestMachine( // we fallback to on-demand ec2Strategy.toLocaleLowerCase() === "none" - )) || ""; - if (instanceId) { + ); + // let's exit, only loop on InsufficientInstanceCapacity + if (instanceId !== "RequestLimitExceeded") { break; } - // let's exit, only loop on InsufficientInstanceCapacity - break; } catch (error) { // TODO is this still the relevant error? if ( @@ -94,7 +94,7 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise { } } // wait 10 seconds - await new Promise((r) => setTimeout(r, 10000)); + await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff)); } if (instanceId) { core.info("Successfully requested instance with ID " + instanceId); diff --git a/.github/spot-runner-action/src/userdata.ts b/.github/spot-runner-action/src/userdata.ts index 83e276ade70..5055ed867fc 100644 --- a/.github/spot-runner-action/src/userdata.ts +++ b/.github/spot-runner-action/src/userdata.ts @@ -8,27 +8,7 @@ export class UserData { this.config = config; } - async getUserDataForBareSpot(): Promise { - const cmds = [ - "#!/bin/bash", - `exec 1>/run/log.out 2>&1`, // Log to /run/log.out - `shutdown -P +${this.config.ec2InstanceTtl}`, - `echo '{"default-address-pools":[{"base":"172.17.0.0/12","size":20}, {"base":"10.99.0.0/12","size":20}, {"base":"192.168.0.0/16","size":24}]}' > /etc/docker/daemon.json`, - `sudo service docker restart`, - "sudo apt install -y brotli", - // NOTE also update versions below and in .github/ci-setup-action/action.yml - "sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly", - "sudo chmod +x /usr/local/bin/earthly", - "touch /home/ubuntu/.user-data-finished", - ]; - console.log( - "Sending: ", - cmds.filter((x) => !x.startsWith("TOKENS")).join("\n") - ); - return Buffer.from(cmds.join("\n")).toString("base64"); - } - - async getUserDataForBuilder(): Promise { + async getUserData(): Promise { if (!this.config.githubActionRunnerLabel) throw Error("failed to object job ID for label"); // Note, we dont make the runner ephemeral as we start fresh runners as needed @@ -42,9 +22,10 @@ export class UserData { `sudo service docker restart`, "sudo wget -q https://github.com/earthly/earthly/releases/download/v0.8.9/earthly-linux-$(dpkg --print-architecture) -O /usr/local/bin/earthly", "sudo chmod +x /usr/local/bin/earthly", - "cd /run", "sudo apt install -y brotli", 'echo "MaxStartups 1000" >> /etc/ssh/sshd_config', + 'echo "ClientAliveInterval=30" >> /etc/ssh/sshd_config', + 'echo "ClientAliveCountMax=20" >> /etc/ssh/sshd_config', "sudo service sshd restart", "touch /home/ubuntu/.user-data-finished", ]; diff --git a/scripts/run_on_builder b/scripts/run_on_builder index bfbe1783545..888a0dc5d0c 100755 --- a/scripts/run_on_builder +++ b/scripts/run_on_builder @@ -4,4 +4,4 @@ set -eu # Enter the repo root cd "$(dirname "$0")/.." -ssh -o StrictHostKeyChecking=no -i "$BUILDER_SPOT_KEY" ubuntu@"$BUILDER_SPOT_IP" "$@" \ No newline at end of file +ssh -o TCPKeepAlive=no -o ServerAliveCountMax=20 -o ServerAliveInterval=30 -o StrictHostKeyChecking=no -i "$BUILDER_SPOT_KEY" ubuntu@"$BUILDER_SPOT_IP" "$@" \ No newline at end of file diff --git a/scripts/run_on_tester b/scripts/run_on_tester index 3ca77b73b4a..075a3b5cbe5 100755 --- a/scripts/run_on_tester +++ b/scripts/run_on_tester @@ -4,4 +4,4 @@ set -eu # Enter the repo root cd "$(dirname "$0")/.." -ssh -o StrictHostKeyChecking=no -i "$SPOT_KEY" ubuntu@"$SPOT_IP" "$@" \ No newline at end of file +ssh -o TCPKeepAlive=no -o ServerAliveCountMax=20 -o ServerAliveInterval=30 -o StrictHostKeyChecking=no -i "$SPOT_KEY" ubuntu@"$SPOT_IP" "$@" \ No newline at end of file