diff --git a/hack/runner/webhook/aws.go b/hack/runner/webhook/aws.go index 75c744f218..737ae48fd0 100644 --- a/hack/runner/webhook/aws.go +++ b/hack/runner/webhook/aws.go @@ -19,6 +19,11 @@ const ( defaultInstanceToTagDelay int = 3 ) +// Aws object +type Aws struct { + client *ec2.EC2 +} + // get github client func getAwsClientWithEnvToken() (*ec2.EC2, error) { // creds from env @@ -47,7 +52,22 @@ func getAwsClientWithEnvToken() (*ec2.EC2, error) { return client, nil } -func createEc2Runner(client *ec2.EC2, uniqueID, runnerToken string) (string, error) { +// NewAws generates a new Aws object +func NewAws() (*Aws, error) { + client, err := getAwsClientWithEnvToken() + if err != nil { + klog.Errorf("getAwsClientWithEnvToken failed. Err: %v\n", err) + return nil, err + } + + mya := &Aws{ + client: client, + } + return mya, nil +} + +// CreateEc2Runner creates a runner +func (a *Aws) CreateEc2Runner(uniqueID, runnerToken string) (string, error) { klog.V(6).Infof("uniqueID: %s\n", uniqueID) klog.V(6).Infof("runnerToken: %s\n", runnerToken) @@ -57,7 +77,7 @@ func createEc2Runner(client *ec2.EC2, uniqueID, runnerToken string) (string, err awsSubnet := os.Getenv("AWS_SUBNET") // Specify the details of the instance that you want to create. - runResult, err := client.RunInstances(&ec2.RunInstancesInput{ + runResult, err := a.client.RunInstances(&ec2.RunInstancesInput{ ImageId: aws.String(awsAmi), InstanceType: aws.String("t2.2xlarge"), MinCount: aws.Int64(1), @@ -78,7 +98,7 @@ func createEc2Runner(client *ec2.EC2, uniqueID, runnerToken string) (string, err time.Sleep(time.Duration(defaultInstanceToTagDelay) * time.Second) // Add tags to the created instance - _, err = client.CreateTags(&ec2.CreateTagsInput{ + _, err = a.client.CreateTags(&ec2.CreateTagsInput{ Resources: []*string{runResult.Instances[0].InstanceId}, Tags: []*ec2.Tag{ { @@ -94,7 +114,7 @@ func createEc2Runner(client *ec2.EC2, uniqueID, runnerToken string) (string, err if err != nil { klog.Errorf("CreateTags failed. Err: %v\n", err) - errDel := deleteEc2Instance(client, instanceID) + errDel := a.DeleteEc2Instance(instanceID) if errDel != nil { klog.Errorf("deleteEc2Instance failed. Err: %v\n", errDel) } @@ -103,13 +123,13 @@ func createEc2Runner(client *ec2.EC2, uniqueID, runnerToken string) (string, err } // wait for instance running - err = client.WaitUntilInstanceRunning(&ec2.DescribeInstancesInput{ + err = a.client.WaitUntilInstanceRunning(&ec2.DescribeInstancesInput{ InstanceIds: []*string{aws.String(instanceID)}, }) if err != nil { klog.Errorf("WaitUntilInstanceRunning failed. Err: %v\n", err) - errDel := deleteEc2Instance(client, instanceID) + errDel := a.DeleteEc2Instance(instanceID) if errDel != nil { klog.Errorf("deleteEc2Instance failed. Err: %v\n", errDel) } @@ -121,7 +141,8 @@ func createEc2Runner(client *ec2.EC2, uniqueID, runnerToken string) (string, err return instanceID, nil } -func deleteEc2InstanceByName(client *ec2.EC2, uniqueID string) error { +// DeleteEc2InstanceByName uhhh deletes an instance by name +func (a *Aws) DeleteEc2InstanceByName(uniqueID string) error { klog.Infof("deleteEc2InstanceByName(%s)\n", uniqueID) params := &ec2.DescribeInstancesInput{ @@ -135,7 +156,7 @@ func deleteEc2InstanceByName(client *ec2.EC2, uniqueID string) error { }, } - result, err := client.DescribeInstances(params) + result, err := a.client.DescribeInstances(params) if err != nil { klog.Errorf("DescribeInstances failed. Err: %v\n", err.Error()) return err @@ -151,7 +172,7 @@ func deleteEc2InstanceByName(client *ec2.EC2, uniqueID string) error { for _, instance := range reservation.Instances { klog.Infof("Attempt to Delete InstanceID: %s\n", *instance.InstanceId) - err := deleteEc2Instance(client, *instance.InstanceId) + err := a.DeleteEc2Instance(*instance.InstanceId) if err != nil { klog.Errorf("deleteEc2Instance failed. Err: %v\n", err) return err @@ -165,9 +186,10 @@ func deleteEc2InstanceByName(client *ec2.EC2, uniqueID string) error { return nil } -func deleteEc2Instance(client *ec2.EC2, instanceID string) error { +// DeleteEc2Instance uhhh deletes an instance by ID +func (a *Aws) DeleteEc2Instance(instanceID string) error { // Specify the details of the instance that you want to create. - _, err := client.TerminateInstances(&ec2.TerminateInstancesInput{ + _, err := a.client.TerminateInstances(&ec2.TerminateInstancesInput{ InstanceIds: []*string{aws.String(instanceID)}, }) if err != nil { diff --git a/hack/runner/webhook/github.go b/hack/runner/webhook/github.go index b505a75256..3ca995c151 100644 --- a/hack/runner/webhook/github.go +++ b/hack/runner/webhook/github.go @@ -16,7 +16,10 @@ import ( ) const ( + runnerIdle string = "idle" runnerOnline string = "online" + runnerActive string = "active" + // runnerOffline string = "offline" ) // Errors @@ -26,8 +29,16 @@ var ( // ErrRunnerOffline Runner is offline ErrRunnerOffline = errors.New("runner is offline") + + // ErrRunnerIsBusy Runner is busy + ErrRunnerIsBusy = errors.New("runner is busy") ) +// GitHub object +type GitHub struct { + client *github.Client +} + // get github client func getGitHubClientWithEnvToken() (*github.Client, error) { token := os.Getenv("GITHUB_TOKEN") @@ -53,14 +64,23 @@ func getGitHubClientWithEnvToken() (*github.Client, error) { return client, nil } -func createRunnerToken(client *github.Client) (string, error) { - if client == nil { - err := ErrClientInvalid - klog.Errorf("Client == nil. Err: %v\n", err) - return "", err +// NewGitHub generates a new GH object +func NewGitHub() (*GitHub, error) { + client, err := getGitHubClientWithEnvToken() + if err != nil { + klog.Errorf("getGitHubClientWithEnvToken failed. Err: %v\n", err) + return nil, err } - token, _, err := client.Actions.CreateRegistrationToken(context.Background(), "vmware-tanzu", "community-edition") + mygh := &GitHub{ + client: client, + } + return mygh, nil +} + +// CreateRunnerToken create a token +func (g *GitHub) CreateRunnerToken() (string, error) { + token, _, err := g.client.Actions.CreateRegistrationToken(context.Background(), "vmware-tanzu", "community-edition") if err != nil { klog.Errorf("Actions.CreateRegistrationToken returned Err: %v\n", err) return "", err @@ -70,17 +90,25 @@ func createRunnerToken(client *github.Client) (string, error) { return *token.Token, nil } -func getGitHubRunner(client *github.Client, runnerName string) (*github.Runner, error) { - klog.Infof("getGitHubRunner(%s)\n", runnerName) - - if client == nil { - err := ErrClientInvalid - klog.Errorf("Client == nil. Err: %v\n", err) +// GetGitHubRunners get all runner +func (g *GitHub) GetGitHubRunners() (*github.Runners, error) { + opts := &github.ListOptions{} + runners, _, err := g.client.Actions.ListRunners(context.Background(), "vmware-tanzu", "community-edition", opts) + if err != nil { + klog.Errorf("Actions.ListRunners failed. Err: %v\n", err) return nil, err } + klog.Infof("GetGitHubRunners succeeded!\n") + return runners, nil +} + +// GetGitHubRunner get runner +func (g *GitHub) GetGitHubRunner(runnerName string) (*github.Runner, error) { + klog.Infof("getGitHubRunner(%s)\n", runnerName) + opts := &github.ListOptions{} - runners, _, err := client.Actions.ListRunners(context.Background(), "vmware-tanzu", "community-edition", opts) + runners, _, err := g.client.Actions.ListRunners(context.Background(), "vmware-tanzu", "community-edition", opts) if err != nil { klog.Errorf("Actions.ListRunners failed. Err: %v\n", err) return nil, err @@ -103,39 +131,35 @@ func getGitHubRunner(client *github.Client, runnerName string) (*github.Runner, return nil, ErrRunnerOffline } -func deleteGitHubRunnerByName(client *github.Client, runnerName string) error { +// DeleteGitHubRunnerByName delete by name +func (g *GitHub) DeleteGitHubRunnerByName(runnerName string) error { klog.Infof("deleteGitHubRunnerByName(%s)\n", runnerName) - if client == nil { - err := ErrClientInvalid - klog.Errorf("Client == nil. Err: %v\n", err) - return err - } if runnerName == "" { err := ErrClientInvalid klog.Errorf("runnerName is empty. Err: %v\n", err) return err } - runner, err := getGitHubRunner(client, runnerName) + runner, err := g.GetGitHubRunner(runnerName) if err != nil { klog.Errorf("getGitHubRunner failed. Err: %v\n", err) return err } - return deleteGitHubRunnerByID(client, *runner.ID) + if *runner.Busy { + klog.Infof("Runner %s is working on another job\n", runnerName) + return ErrRunnerIsBusy + } + + return g.DeleteGitHubRunnerByID(*runner.ID) } -func deleteGitHubRunnerByID(client *github.Client, runnerID int64) error { +// DeleteGitHubRunnerByID delete by ID +func (g *GitHub) DeleteGitHubRunnerByID(runnerID int64) error { klog.Infof("deleteGitHubRunnerByID(%d)\n", runnerID) - if client == nil { - err := ErrClientInvalid - klog.Errorf("Client == nil. Err: %v\n", err) - return err - } - - _, err := client.Actions.RemoveRunner(context.Background(), "vmware-tanzu", "community-edition", runnerID) + _, err := g.client.Actions.RemoveRunner(context.Background(), "vmware-tanzu", "community-edition", runnerID) if err != nil { klog.Errorf("Actions.RemoveRunner failed. Err: %v\n", err) return err diff --git a/hack/runner/webhook/main.go b/hack/runner/webhook/main.go index a697d8c5f5..ee13b0c3e0 100644 --- a/hack/runner/webhook/main.go +++ b/hack/runner/webhook/main.go @@ -137,10 +137,7 @@ func checkAndDumpSettings() { } } -func main() { - initLogging() - checkAndDumpSettings() - +func initServer() { // envvars var port string if v := os.Getenv("LISTEN_PORT"); v != "" { @@ -190,6 +187,11 @@ func main() { } }) + // clean up + go func() { + backgroundClean() + }() + // generic version check http.HandleFunc(versionPath, func(w http.ResponseWriter, r *http.Request) { version := Version{ @@ -211,3 +213,9 @@ func main() { klog.Errorf("ListenAndServe failed. Err: %v\n", err) } } + +func main() { + initLogging() + checkAndDumpSettings() + initServer() +} diff --git a/hack/runner/webhook/server.go b/hack/runner/webhook/server.go index 8ef5e0e96a..84f182cc9d 100644 --- a/hack/runner/webhook/server.go +++ b/hack/runner/webhook/server.go @@ -15,13 +15,14 @@ import ( klog "k8s.io/klog/v2" - ec2 "github.com/aws/aws-sdk-go/service/ec2" webhook "github.com/go-playground/webhooks/v6/github" - github "github.com/google/go-github/v39/github" ) const ( - workflowJobInProgress string = "in_progress" + selfhostedRunnerPrefix string = "id-" + + // workflowJobQueued string = "queued" + // workflowJobInProgress string = "in_progress" workflowJobCompleted string = "completed" workflowJobSetupRunner string = "Start self-hosted EC2 runner" workflowJobTeardownRunner string = "Stop self-hosted EC2 runner" @@ -35,6 +36,7 @@ const ( defaultGetWorkflowRunTimeout int = 3 defaultGetWorkflowRunRetry int = 3 defaultGetWorkflowRunBetweenPoll int64 = 2 + defaultCleanUpOrphans int64 = 3600 ) // Errors @@ -43,14 +45,36 @@ var ( ErrCreateAndConnectRunner = errors.New("failed to create and connect the runner") ) -func createOnlineRunner(ghClient *github.Client, ec2Client *ec2.EC2, uniqueID string) error { - token, err := createRunnerToken(ghClient) +func createOnlineRunner(g *GitHub, a *Aws, uniqueID string) error { + klog.Infof("Check to see if %s runner exists...\n", uniqueID) + runner, err := g.GetGitHubRunner(uniqueID) + if err == nil { + klog.Infof("Runner %s exists. Status: %s\n", uniqueID, *runner.Status) + switch *runner.Status { + case runnerIdle: + klog.Infof("Runner %s is idle. Skip creation!\n", uniqueID) + return nil + + case runnerOnline: + klog.Infof("Runner %s is online. Skip creation!\n", uniqueID) + return nil + + case runnerActive: + klog.Infof("Runner %s is active. Skip creation!\n", uniqueID) + return nil + + default: + klog.Infof("Runner %s is %s. Continue with creation!\n", uniqueID, *runner.Status) + } + } + + token, err := g.CreateRunnerToken() if err != nil { klog.Errorf("getGitHubClientWithEnvToken failed. Err: %v\n", err) return err } - instanceID, err := createEc2Runner(ec2Client, uniqueID, token) + instanceID, err := a.CreateEc2Runner(uniqueID, token) if err != nil { klog.Errorf("createEc2Runner failed. Err: %v\n", err) return err @@ -61,7 +85,7 @@ func createOnlineRunner(ghClient *github.Client, ec2Client *ec2.EC2, uniqueID st succeeded := false for i := 0; i < defaultNumOfTimesToPoll; i++ { - runner, err := getGitHubRunner(ghClient, uniqueID) + runner, err := g.GetGitHubRunner(uniqueID) if err == nil { klog.Infof("Status: %s\n", *runner.Status) if !strings.EqualFold(*runner.Status, runnerOnline) && i > defaultmustHaveStatusBefore { @@ -86,7 +110,7 @@ func createOnlineRunner(ghClient *github.Client, ec2Client *ec2.EC2, uniqueID st if !succeeded { klog.Errorf("createOnlineRunner failed. Delete instance %s\n", instanceID) - err = deleteEc2Instance(ec2Client, instanceID) + err = a.DeleteEc2Instance(instanceID) if err != nil { klog.Errorf("deleteEc2Instance failed. Err: %v\n", err) } @@ -94,18 +118,19 @@ func createOnlineRunner(ghClient *github.Client, ec2Client *ec2.EC2, uniqueID st return ErrCreateAndConnectRunner } + klog.Infof("createOnlineRunner succeeded. instance: %s\n", instanceID) return nil } func createRunner(uniqueID string) error { - ghClient, err := getGitHubClientWithEnvToken() + ghClient, err := NewGitHub() if err != nil { - klog.Errorf("getGitHubClientWithEnvToken failed. Err: %v\n", err) + klog.Errorf("NewGitHub failed. Err: %v\n", err) return err } - ec2Client, err := getAwsClientWithEnvToken() + ec2Client, err := NewAws() if err != nil { - klog.Errorf("getAwsClientWithEnvToken failed. Err: %v\n", err) + klog.Errorf("NewAws failed. Err: %v\n", err) return err } @@ -123,25 +148,34 @@ func createRunner(uniqueID string) error { } func deleteRunner(uniqueID string) error { - ghClient, err := getGitHubClientWithEnvToken() + // insert small delay just like in create. this allows the runner agent time + // to become not busy + klog.Infof("Giving settling time...\n") + time.Sleep(time.Duration(defaultSleepHeadStart) * time.Second) + + ghClient, err := NewGitHub() if err != nil { - klog.Errorf("getGitHubClientWithEnvToken failed. Err: %v\n", err) + klog.Errorf("NewGitHub failed. Err: %v\n", err) return err } - ec2Client, err := getAwsClientWithEnvToken() + ec2Client, err := NewAws() if err != nil { - klog.Errorf("getAwsClientWithEnvToken failed. Err: %v\n", err) + klog.Errorf("NewAws failed. Err: %v\n", err) return err } - err = deleteGitHubRunnerByName(ghClient, uniqueID) + err = ghClient.DeleteGitHubRunnerByName(uniqueID) + if err == ErrRunnerIsBusy { + klog.Infof("Runner is busy working on other requests. Skipping deletion of runner and ec2 instance.\n") + return nil + } if err != nil { // Just a warning because of the new self host ephemeral // Do not error this function out because we need to delete the instance klog.Infof("deleteGitHubRunnerByName failed. Err: %v\n", err) } - err = deleteEc2InstanceByName(ec2Client, uniqueID) + err = ec2Client.DeleteEc2InstanceByName(uniqueID) if err != nil { klog.Errorf("deleteEc2InstanceByName failed. Err: %v\n", err) return err @@ -244,12 +278,15 @@ func handleWorkflowJob(workflowJob *webhook.WorkflowJobPayload) error { klog.V(6).Infof("---------------------- END DUMP EVENT ----------------------\n\n\n") workflowName := workflowJob.WorkflowJob.Name + klog.Infof("workflowName: %s, Action: %s\n", workflowName, workflowJob.Action) switch workflowName { case workflowJobSetupRunner: + klog.Infof("doWorkflowJob using create\n") return doWorkflowJob(workflowJob, true) case workflowJobTeardownRunner: + klog.Infof("doWorkflowJob using delete\n") return doWorkflowJob(workflowJob, false) default: @@ -259,12 +296,11 @@ func handleWorkflowJob(workflowJob *webhook.WorkflowJobPayload) error { } func doWorkflowJob(workflowJob *webhook.WorkflowJobPayload, create bool) error { - if (create && !strings.EqualFold(workflowJob.Action, workflowJobInProgress)) || + if (create && !strings.EqualFold(workflowJob.Action, workflowJobCompleted)) || (!create && !strings.EqualFold(workflowJob.Action, workflowJobCompleted)) { klog.Infof("doWorkflowJob create: %t, status %s. Skipping!\n", create, workflowJob.Action) return nil } - klog.Infof("doWorkflowJob using create %t\n", create) // get the WorkflowRun which represents the entire workflow end-to-end workflowRun, err := getWorkflowRun(workflowJob.WorkflowJob.RunURL) @@ -305,3 +341,60 @@ func handleWorkflowRun(workflowRun *webhook.WorkflowRunPayload) { klog.V(6).Infof("%+v\n\n\n", workflowRun) klog.V(6).Infof("---------------------- END DUMP EVENT ----------------------\n\n\n") } + +func backgroundClean() { + // while true loop + for { + time.Sleep(time.Duration(defaultCleanUpOrphans) * time.Second) + klog.V(4).Infof("Do routine clean up check\n") + + ghClient, err := NewGitHub() + if err != nil { + klog.Errorf("NewGitHub failed. Err: %v\n", err) + continue + } + ec2Client, err := NewAws() + if err != nil { + klog.Errorf("NewAws failed. Err: %v\n", err) + continue + } + + runners, err := ghClient.GetGitHubRunners() + if err != nil { + klog.Errorf("GetGitHubRunners failed. Err: %v\n", err) + continue + } + + if runners.TotalCount == 0 { + klog.V(4).Infof("runners.TotalCount == 0. Skip!\n") + continue + } + + for _, runner := range runners.Runners { + runnerName := *runner.Name + runnerStatus := *runner.Status + klog.Infof("Runner: %s\n", runnerName) + + if !strings.Contains(selfhostedRunnerPrefix, runnerName) { + klog.Infof("Skipping Runner: %s\n", runnerName) + continue + } + + klog.V(4).Infof("Runner %s Status: %s\n", runnerName, runnerStatus) + switch runnerStatus { + case runnerIdle: + err := ghClient.DeleteGitHubRunnerByName(runnerName) + if err != nil { + klog.Errorf("DeleteGitHubRunnerByName(%s) failed. Err: %v\n", runnerName, err) + } + err = ec2Client.DeleteEc2InstanceByName(runnerName) + if err != nil { + klog.Errorf("DeleteEc2InstanceByName(%s) failed. Err: %v\n", runnerName, err) + } + + default: + klog.V(4).Infof("Runner %s is %s. Is active!\n", runnerName, runnerStatus) + } + } + } +}