Switch Job Acquire to use expo backoff, use jitter
moskyb committed Jan 9, 2025
1 parent 9f57ea3 · commit 55f6542
Showing 2 changed files with 9 additions and 8 deletions.
agent/agent_worker_test.go (4 changes: 2 additions & 2 deletions)
@@ -218,8 +218,8 @@ func TestAcquireAndRunJobWaiting(t *testing.T) {
 	}
 
 	// the last Retry-After is not recorded as the retries loop exits before using it
-	expectedSleeps := make([]time.Duration, 0, 9)
-	for d := 1; d <= 1<<8; d *= 2 {
+	expectedSleeps := make([]time.Duration, 0, 6)
+	for d := 1; d <= 1<<5; d *= 2 {
 		expectedSleeps = append(expectedSleeps, time.Duration(d)*time.Second)
 	}
 	assert.Equal(t, expectedSleeps, retrySleeps)
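For concreteness, the updated loop can be run on its own; this standalone snippet (standard library only, not part of the commit) prints the sleeps the test now expects:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Mirrors the updated test loop: d doubles from 1 up to 1<<5 (32),
	// producing six expected sleeps.
	expectedSleeps := make([]time.Duration, 0, 6)
	for d := 1; d <= 1<<5; d *= 2 {
		expectedSleeps = append(expectedSleeps, time.Duration(d)*time.Second)
	}
	fmt.Println(expectedSleeps) // [1s 2s 4s 8s 16s 32s]
}

Before the change, the loop ran up to 1<<8 (256) and expected nine sleeps totalling 511s.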
core/client.go (13 changes: 7 additions & 6 deletions)
@@ -56,19 +56,20 @@ func (c *Client) AcquireJob(ctx context.Context, jobID string) (*api.Job, error)
 	// large if the job is in the waiting state.
 	//
 	// If there were no delays or jitter, the attempts would happen at t = 0, 1, 2, 4, ..., 128s
-	// after the initial one. Therefore, there are 9 attempts taking at least 255s. If the jitter
-	// always hit the max of 1s, then another 8s is added to that. This is still comfortably within
-	// the timeout of 270s, and the bound seems tight enough so that the agent is not wasting time
+	// after the initial one. Therefore, there are 7 attempts taking at least 255s. If the jitter
+	// always hit the max of 5s, then another 40s is added to that. This is still comfortably within
+	// the timeout of 330s, and the bound seems tight enough so that the agent is not wasting time
 	// waiting for a retry that will never happen.
-	timeoutCtx, cancel := context.WithTimeout(ctx, 270*time.Second)
+	timeoutCtx, cancel := context.WithTimeout(ctx, 330*time.Second)
 	defer cancel()
 
 	// Acquire the job using the ID we were provided. We'll retry as best we can on non 422 error.
 	// Except for 423 errors, in which we exponentially back off under the direction of the API
 	// setting the Retry-After header
 	r := roko.NewRetrier(
-		roko.WithMaxAttempts(10),
-		roko.WithStrategy(roko.Constant(3*time.Second)),
+		roko.WithMaxAttempts(7),
+		roko.WithStrategy(roko.Exponential(2*time.Second, 0)),
+		roko.WithJitterRange(-time.Second, 5*time.Second),
 		roko.WithSleepFunc(c.RetrySleepFunc),
 	)

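The retrier change is the heart of the commit: a constant 3s delay with 10 attempts becomes 7 attempts with exponential backoff and a jitter range of [-1s, +5s]. As a rough standalone model of the timing comment above (a sketch only, not roko's implementation; the doubling-from-1s schedule and the uniform jitter are assumptions read off that comment):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	rng := rand.New(rand.NewSource(1))
	var total time.Duration
	// The comment reasons about a doubling schedule of 1s, 2s, ..., 128s
	// between attempts, each nudged by a jitter drawn uniformly from
	// [-1s, +5s).
	for n := 0; n < 8; n++ {
		base := time.Second << n
		jitter := time.Duration(rng.Int63n(int64(6*time.Second))) - time.Second
		total += base + jitter
		fmt.Printf("sleep %d: %v\n", n+1, (base + jitter).Round(time.Millisecond))
	}
	// Under this model the base delays sum to 255s and jitter adds at
	// most 8*5s = 40s, so a worst case of 295s stays inside the 330s timeout.
	fmt.Println("total:", total.Round(time.Millisecond))
}

Allowing negative jitter lets some retries fire slightly early, spreading a fleet of agents to both sides of the nominal schedule rather than stacking them on it.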
