From fe553f5c05851fc547eff4803959d968446588b4 Mon Sep 17 00:00:00 2001 From: motatoes Date: Tue, 17 Sep 2024 12:55:19 +0100 Subject: [PATCH 1/2] add failure-reason when runs fail --- next/model/digger_runs.gen.go | 1 + next/models_generated/digger_runs.gen.go | 6 +- next/services/runs.go | 109 ++++++++++++++++++++++- 3 files changed, 112 insertions(+), 4 deletions(-) diff --git a/next/model/digger_runs.gen.go b/next/model/digger_runs.gen.go index e87d35e8e..8363d431e 100644 --- a/next/model/digger_runs.gen.go +++ b/next/model/digger_runs.gen.go @@ -37,6 +37,7 @@ type DiggerRun struct { ApplyLogs string `gorm:"column:apply_logs" json:"apply_logs"` ApproverUserID *string `gorm:"column:approver_user_id" json:"approver_user_id"` TriggeredByUserID *string `gorm:"column:triggered_by_user_id" json:"triggered_by_user_id"` + FailureReason string `gorm:"column:failure_reason" json:"failure_reason"` } // TableName DiggerRun's table name diff --git a/next/models_generated/digger_runs.gen.go b/next/models_generated/digger_runs.gen.go index 52af5527c..556bb21fb 100644 --- a/next/models_generated/digger_runs.gen.go +++ b/next/models_generated/digger_runs.gen.go @@ -50,6 +50,7 @@ func newDiggerRun(db *gorm.DB, opts ...gen.DOOption) diggerRun { _diggerRun.ApplyLogs = field.NewString(tableName, "apply_logs") _diggerRun.ApproverUserID = field.NewString(tableName, "approver_user_id") _diggerRun.TriggeredByUserID = field.NewString(tableName, "triggered_by_user_id") + _diggerRun.FailureReason = field.NewString(tableName, "failure_reason") _diggerRun.fillFieldMap() @@ -83,6 +84,7 @@ type diggerRun struct { ApplyLogs field.String ApproverUserID field.String TriggeredByUserID field.String + FailureReason field.String fieldMap map[string]field.Expr } @@ -122,6 +124,7 @@ func (d *diggerRun) updateTableName(table string) *diggerRun { d.ApplyLogs = field.NewString(table, "apply_logs") d.ApproverUserID = field.NewString(table, "approver_user_id") d.TriggeredByUserID = field.NewString(table, "triggered_by_user_id") + d.FailureReason = field.NewString(table, "failure_reason") d.fillFieldMap() @@ -138,7 +141,7 @@ func (d *diggerRun) GetFieldByName(fieldName string) (field.OrderExpr, bool) { } func (d *diggerRun) fillFieldMap() { - d.fieldMap = make(map[string]field.Expr, 23) + d.fieldMap = make(map[string]field.Expr, 24) d.fieldMap["id"] = d.ID d.fieldMap["created_at"] = d.CreatedAt d.fieldMap["updated_at"] = d.UpdatedAt @@ -162,6 +165,7 @@ func (d *diggerRun) fillFieldMap() { d.fieldMap["apply_logs"] = d.ApplyLogs d.fieldMap["approver_user_id"] = d.ApproverUserID d.fieldMap["triggered_by_user_id"] = d.TriggeredByUserID + d.fieldMap["failure_reason"] = d.FailureReason } func (d diggerRun) clone(db *gorm.DB) diggerRun { diff --git a/next/services/runs.go b/next/services/runs.go index 4ebc1005d..9bce16d5f 100644 --- a/next/services/runs.go +++ b/next/services/runs.go @@ -39,12 +39,24 @@ func RunQueuesStateMachine(queueItem *model.DiggerRunQueueItem, service ci.PullR runName, err := GetRunNameFromJob(*planJob) if err != nil { log.Printf("could not get run name: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "Could not load run name" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get run name: %v", err) } err = RefreshVariableSpecForJob(planJob) if err != nil { log.Printf("could not get variable spec from job: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "Could not load variables" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get variable spec from job: %v", err) } @@ -53,28 +65,64 @@ func RunQueuesStateMachine(queueItem *model.DiggerRunQueueItem, service ci.PullR err = RefreshVariableSpecForJob(applyJob) if err != nil { log.Printf("could not get variable spec from job: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not load variables" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get variable spec from job: %v", err) } spec, err := GetSpecFromJob(*planJob) if err != nil { log.Printf("could not get spec: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not prepare job spec for triggering" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get spec: %v", err) } vcsToken, err := GetVCSTokenFromJob(*planJob, gh) if err != nil { log.Printf("could not get vcs token: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not fetch VCS token (hint: is your app installed for repo?)" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } + return fmt.Errorf("could not get vcs token: %v", err) } err = dbmodels.DB.RefreshDiggerJobTokenExpiry(planJob) if err != nil { log.Printf("could not refresh job token expiry: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not refresh digger token (likely an internal error)" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not refresh job token from expiry: %v", err) } - ciBackend.TriggerWorkflow(*spec, *runName, *vcsToken) + err = ciBackend.TriggerWorkflow(*spec, *runName, *vcsToken) + if err != nil { + log.Printf("ERROR: Failed to trigger for Digger Run queueID: %v [%v %v]", queueItem.ID, queueItem.DiggerRunID, dr.ProjectName) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = fmt.Sprintf("could not trigger workflow, internal error: %v", err) + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } + + return fmt.Errorf("ERROR: Failed to trigger for Digger Run queueID: %v [%v %v]", queueItem.ID, queueItem.DiggerRunID, dr.ProjectName) + } // change status to RunPendingPlan log.Printf("Updating run queueItem item to planning state") @@ -89,6 +137,12 @@ func RunQueuesStateMachine(queueItem *model.DiggerRunQueueItem, service ci.PullR batch, err := dbmodels.DB.GetDiggerBatch(planStage.BatchID) if err != nil { log.Printf("could not get plan batch: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not find digger batch" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get plan batch: %v", err) } batchStatus := batch.Status @@ -97,6 +151,7 @@ func RunQueuesStateMachine(queueItem *model.DiggerRunQueueItem, service ci.PullR // if failed then go straight to failed if batchStatus == int16(orchestrator_scheduler.BatchJobFailed) { dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "The job failed to run, please check action logs for more details" err := dbmodels.DB.UpdateDiggerRun(dr) if err != nil { log.Printf("ERROR: Failed to update Digger Run for queueID: %v [%v %v]", queueItem.ID, queueItem.DiggerRunID, dr.ProjectName) @@ -136,34 +191,75 @@ func RunQueuesStateMachine(queueItem *model.DiggerRunQueueItem, service ci.PullR client := service.(*github.GithubService).Client ciBackend := ci_backends.GithubActionCi{Client: client} if err != nil { - log.Printf("could not get run name: %v", err) + log.Printf("could not get job: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not get job from run stage" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get run name: %v", err) } runName, err := GetRunNameFromJob(*job) if err != nil { log.Printf("could not get run name: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not get run name" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get run name: %v", err) } spec, err := GetSpecFromJob(*job) if err != nil { log.Printf("could not get spec: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could get spec from job" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get spec: %v", err) } vcsToken, err := GetVCSTokenFromJob(*job, gh) if err != nil { log.Printf("could not get vcs token: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not fetch vcs token (hint: is the app still installed?)" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } + return fmt.Errorf("could not get spec: %v", err) } err = dbmodels.DB.RefreshDiggerJobTokenExpiry(job) if err != nil { log.Printf("could not refresh job token expiry: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not refresh expiry token" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not refresh job token from expiry: %v", err) } - ciBackend.TriggerWorkflow(*spec, *runName, *vcsToken) + err = ciBackend.TriggerWorkflow(*spec, *runName, *vcsToken) + if err != nil { + log.Printf("could not trigger workflow for apply queueItem: %v [%v %v]", queueItem.ID, queueItem.DiggerRunID, dr.ProjectName) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = fmt.Sprintf("could not trigger workflow: %v", err) + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } + return fmt.Errorf("ERROR: failed to trigger workflow: %v", err) + } dr.Status = string(dbmodels.RunApplying) err = dbmodels.DB.UpdateDiggerRun(dr) @@ -177,6 +273,12 @@ func RunQueuesStateMachine(queueItem *model.DiggerRunQueueItem, service ci.PullR batch, err := dbmodels.DB.GetDiggerBatch(applyStage.BatchID) if err != nil { log.Printf("could not get apply batch: %v", err) + dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "could not get apply batch" + err = dbmodels.DB.UpdateDiggerRun(dr) + if err != nil { + log.Printf("Error: could not update digger status to failed: %v", err) + } return fmt.Errorf("could not get apply batch: %v", err) } batchStatus := batch.Status @@ -184,6 +286,7 @@ func RunQueuesStateMachine(queueItem *model.DiggerRunQueueItem, service ci.PullR // if failed then go straight to failed if batchStatus == int16(orchestrator_scheduler.BatchJobFailed) { dr.Status = string(dbmodels.RunFailed) + dr.FailureReason = "the job failed to run, please refer to action logs for details" err := dbmodels.DB.UpdateDiggerRun(dr) if err != nil { log.Printf("ERROR: Failed to update Digger Run for queueID: %v [%v %v]", queueItem.ID, queueItem.DiggerRunID, dr.ProjectName) From 77f2a413d0dc28f4f67da05799ca62dfa8646094 Mon Sep 17 00:00:00 2001 From: motatoes Date: Tue, 17 Sep 2024 12:55:45 +0100 Subject: [PATCH 2/2] deploy --- .github/workflows/next_deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/next_deploy.yml b/.github/workflows/next_deploy.yml index 1cbf6db15..d23c37d2e 100644 --- a/.github/workflows/next_deploy.yml +++ b/.github/workflows/next_deploy.yml @@ -3,7 +3,7 @@ on: push: branches: - develop # change to main if needed - - feat/drift-runs-improvement-name + - feat/add-failure-reason-when-run-fails jobs: deploy: name: Deploy app