-
Notifications
You must be signed in to change notification settings - Fork 310
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: more robust spot shutdown + CI commandline (#5825)
GitHub Actions runners are a game of chance: if you get assigned a runner that is going down, you lose. This commit includes some logic, inspired by a GitHub Actions runner controller for Kubernetes, to bring runners down gracefully. It also includes a ci.py script you can run as ./ci.py, which offers the following menu: "1. SSH into build machine", "2. SSH into bench machine", "3. Start/Stop spot machines", "4. Manage Running Jobs". Option 4 has further sub-actions for viewing, cancelling, and rerunning jobs from the command line (including starting the spot machines first, as recommended when rerunning failed jobs).
- Loading branch information
Showing
7 changed files
with
194 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#!/usr/bin/env python3 | ||
# ubuntu: apt install python3-blessed | ||
from blessed import Terminal | ||
import os, json, subprocess, sys | ||
|
||
# Terminal handle used by the interactive menu.
term = Terminal()

# The instance lookup and the `gh run list` query in this script are keyed on
# the GitHub user name, so refuse to start without it.
GITHUB_ACTOR = os.environ.get('GITHUB_ACTOR')
if GITHUB_ACTOR is None:
    print("Make sure you have GITHUB_ACTOR in your environment variables e.g. .zshrc")
    sys.exit(1)
|
||
def main():
    """Show the top-level menu, read a single keypress and run the chosen action.

    The menu is re-printed until the key pressed is one of 1-4 or q.
    Pressing q leaves the menu without running anything.
    """
    menu_lines = (
        "1. SSH into build machine",
        "2. SSH into bench machine",
        "3. Start/Stop spot machines",
        "4. Manage Running Jobs",
        "q. Quit",
    )
    accepted_keys = ('1', '2', '3', '4', 'q')

    choice = None
    with term.fullscreen(), term.cbreak():
        print(term.home + term.clear)
        while choice not in accepted_keys:
            print(term.move_y(1) + "Please select an option:")
            for line in menu_lines:
                print(line)
            # Read the key with the cursor parked on the bottom row.
            with term.location(0, term.height - 1):
                choice = term.inkey()

    # NOTE(review): the dispatch runs after the fullscreen/cbreak context has
    # exited, so sub-commands that call input() see the normal terminal mode.
    # The original indentation was lost; confirm this placement.
    dispatch = (
        ('1', lambda: ssh_into_machine('x86')),
        ('2', lambda: ssh_into_machine('bench-x86')),
        ('3', manage_spot_instances),
        ('4', manage_ci_workflows),
    )
    for key, handler in dispatch:
        if choice == key:
            handler()
            break
|
||
def ssh_into_machine(suffix):
    """Open an interactive SSH session to the user's running EC2 instance.

    Uses the AWS CLI to look up the running instance tagged
    ``aztec-packages-<GITHUB_ACTOR>-<suffix>`` in us-east-2, then connects to
    its public IP as ``ubuntu`` with the key at ``~/.ssh/build_instance_key``.
    Prints a message and returns early if the key is missing, the lookup
    fails, or no matching instance with a public IP is found.

    Args:
        suffix: Instance name suffix, e.g. ``'x86'`` or ``'bench-x86'``.
    """
    # Read into a local name so the module-level GITHUB_ACTOR is not shadowed.
    github_actor = os.getenv('GITHUB_ACTOR', 'default_actor')
    ssh_key_path = os.path.expanduser('~/.ssh/build_instance_key')
    if not os.path.exists(ssh_key_path):
        print("SSH key does not exist.")
        return

    instance_name = f"aztec-packages-{github_actor}-{suffix}"
    # Argument lists (no shell) keep values with spaces or shell
    # metacharacters from being re-interpreted.
    describe_cmd = [
        'aws', 'ec2', 'describe-instances',
        '--filters',
        'Name=instance-state-name,Values=running',
        f'Name=tag:Name,Values={instance_name}',
        '--output', 'json',
        '--region', 'us-east-2',
    ]
    try:
        result = subprocess.run(describe_cmd, capture_output=True, text=True)
    except OSError as e:
        # Without a shell, a missing `aws` binary raises instead of returning 127.
        print("Failed to get AWS instances:", e)
        return
    if result.returncode != 0:
        print("Failed to get AWS instances:", result.stderr)
        return

    # Parse the output to find the public IP address.
    try:
        reservations = json.loads(result.stdout)['Reservations']
        if not reservations:
            # An empty list means nothing matched the filters; say so instead
            # of reporting an opaque IndexError.
            print(f"No running instance named {instance_name} found.")
            return
        instance_ip = reservations[0]['Instances'][0]['PublicIpAddress']
    except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
        print("Error parsing AWS CLI output:", e)
        return

    # SSH using the public IP; the session inherits this process's terminal.
    ssh_cmd = [
        'ssh',
        '-o', 'StrictHostKeyChecking=no',
        '-i', ssh_key_path,
        f'ubuntu@{instance_ip}',
    ]
    print(f"Connecting to {instance_ip}. Consider delaying the impending shutdown.")
    try:
        subprocess.run(ssh_cmd)  # Blocks until the SSH session completes.
    except OSError as e:
        print("Failed to start ssh:", e)
|
||
def manage_spot_instances():
    """Prompt for 'start' or 'stop' and trigger the matching spot workflow.

    'start' runs the ``start-spot.yml`` workflow and 'stop' runs
    ``stop-spot.yml`` through the ``gh`` CLI. Surrounding whitespace and
    letter case in the answer are ignored; any other answer is reported
    rather than silently dropped.
    """
    workflows = {'start': 'start-spot.yml', 'stop': 'stop-spot.yml'}
    action = input("Enter 'start' to run or 'stop' to stop spot instances: ").strip().lower()
    workflow = workflows.get(action)
    if workflow is None:
        print(f"Unrecognized action {action!r}; expected 'start' or 'stop'.")
        return
    subprocess.run(f'gh workflow run {workflow}', shell=True)
|
||
def manage_ci_workflows():
    """List the user's recent CI runs and apply one action to a chosen run.

    Prints the five most recent ``ci.yml`` runs for ``GITHUB_ACTOR``, then
    prompts for an action and a run ID. Supported actions:

    - ``cancel``: ``gh run cancel``
    - ``rerun``: start the spot machines, then rerun only the failed jobs
    - ``rerun-all``: rerun the whole run
    - ``force-cancel``: call the force-cancel REST endpoint for the run
    - anything else (default ``view``): ``gh run watch``

    Exactly one action is performed per invocation.
    """
    # Retrieve the most recent workflow runs.
    list_cmd = f"gh run list --workflow=ci.yml -u {GITHUB_ACTOR} --limit 5"
    result = subprocess.run(list_cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0 or not result.stdout.strip():
        print("Failed to retrieve workflow runs or no runs found.")
        return
    print("Most recent CI run details:")
    print(result.stdout)

    action = input("Enter action 'cancel', 'rerun', 'rerun-all', 'force-cancel' or 'view' (default): ")
    action = action.strip().lower() or 'view'
    print(f"\nWill perform {action}")
    run_id = input(f"Enter the run ID to {action}: ").strip()

    # The run ID is interpolated into shell commands below, so only accept a
    # plain number. An empty ID is still allowed and passed through as-is.
    if run_id and not (run_id.isascii() and run_id.isdigit()):
        print("Run ID must be numeric.")
        return

    if action == 'cancel':
        subprocess.run(f"gh run cancel {run_id}", shell=True)
    elif action == 'rerun':
        # needed so the spot runners still work
        subprocess.run('gh workflow run start-spot.yml', shell=True)
        subprocess.run(f"gh run rerun {run_id} --failed", shell=True)
    elif action == 'rerun-all':
        subprocess.run(f"gh run rerun {run_id}", shell=True)
    elif action == 'force-cancel':
        if not run_id:
            # The REST path is built from the ID; without one it is invalid.
            print("force-cancel requires a run ID.")
            return
        subprocess.run(
            'gh api --method POST -H "Accept: application/vnd.github+json" '
            '-H "X-GitHub-Api-Version: 2022-11-28" '
            f'/repos/AztecProtocol/aztec-packages/actions/runs/{run_id}/force-cancel',
            shell=True,
        )
    else:
        subprocess.run(f"gh run watch {run_id}", shell=True)
|
||
# Launch the interactive menu only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash
# Shuts this machine down if no Runner.Worker process shows up within
# MAX_WAIT_TIME seconds. Deregisters the GitHub runners first via
# /run/spot_runner_graceful_exit.sh.
set -eux

MAX_WAIT_TIME=300 # Maximum wait time in seconds
WAIT_INTERVAL=10 # Interval between checks in seconds
LOCK_FILE=/run/.maybe-exit-spot-lock
elapsed_time=0

exec &> >(tee -a /run/.maybe-exit-log)

# we have this in a minutely crontab for simplicity, but we only want one to run.
# With noclobber the redirect fails if the file already exists, so testing for
# the lock and taking it happen in one atomic step.
if ! ( set -o noclobber; : > "$LOCK_FILE" ) 2>/dev/null; then
  echo "Already running maybe_exit_spot.sh"
  exit
fi

cleanup() {
  # -f: never fail the EXIT trap if the lock is already gone.
  rm -f "$LOCK_FILE"
}

trap cleanup EXIT

# From here on stdout goes to (and truncates) this file; stderr, including the
# `set -x` trace, keeps going to the tee set up above.
exec >/run/.maybe-exit-spot-log

# We wait up to MAX_WAIT_TIME seconds to see if a Runner.Worker process comes up.
while ! pgrep Runner.Worker > /dev/null; do
  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
    echo "Found no runner for $MAX_WAIT_TIME, shutting down now."
    # Deregistration is best effort: under `set -e` a failure here would
    # otherwise abort the script before the idle machine is shut down.
    /run/spot_runner_graceful_exit.sh || echo "Graceful runner exit failed, shutting down anyway."
    shutdown now
    exit
  fi

  sleep "$WAIT_INTERVAL"
  elapsed_time=$((elapsed_time + WAIT_INTERVAL))
done
echo "System seems alive, doing nothing."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/bash
# Adapted from https://github.com/actions/actions-runner-controller/blob/master/runner/graceful-stop.sh
# (The shebang must be the first line of the file to take effect.)

set -eu

export RUNNER_ALLOW_RUNASROOT=1
# This should be short so that the job is cancelled immediately, instead of hanging for 10 minutes or so and failing without any error message.
# NOTE(review): nothing below reads this variable, and the wait loops in this
# script are unbounded. Confirm whether a timeout was meant to be applied.
RUNNER_GRACEFUL_STOP_TIMEOUT=${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}

echo "Executing graceful shutdown of github action runners."

# The below procedure atomically removes the runner from GitHub Actions service,
# to ensure that the runner is not running any job.
# This is required to not terminate the actions runner agent while running the job.
# If we didn't do this atomically, we might end up with a rare race where
# the runner agent is terminated while it was about to start a job.

# Without nullglob an unmatched pattern is passed through literally, and the
# failing pushd would abort the whole script under `set -e`.
shopt -s nullglob

# glob for all our installed runner directories
for RUNNER_DIR in /run/*-ec2-* ; do
  pushd "$RUNNER_DIR"
  # Removal runs in the background so all runners deregister in parallel;
  # failures are tolerated here and retried in the clean-up loop below.
  ./config.sh remove --token "$(cat "$RUNNER_DIR/.runner-token")" || true &
  popd
done
wait

# The below procedure fixes the runner to correctly notify the Actions service for the cancellation of this runner.
# It enables you to see `Error: The operation was canceled.` vs having it hang for 10 minutes or so.
# pkill finds and signals in one step, so a listener that exits between a
# separate check and `kill` can no longer leave `kill` without a PID.
if pkill -TERM Runner.Listener; then
  while pgrep Runner.Listener > /dev/null; do
    sleep 1
  done
fi
echo "Cleaning up lingering runner registrations."
for RUNNER_DIR in /run/*-ec2-* ; do
  pushd "$RUNNER_DIR"
  # Retry until the .runner file is gone from this runner directory.
  while [ -f .runner ] ; do
    ./config.sh remove --token "$(cat "$RUNNER_DIR/.runner-token")" || true
    sleep 1
  done
  popd
done
echo "Graceful github runner stop completed."