From 55f6542065d904c92a0a4811e0f667ae0bb40b67 Mon Sep 17 00:00:00 2001 From: Ben Moskovitz Date: Thu, 9 Jan 2025 13:48:04 +1100 Subject: [PATCH] Switch Job Acquire to use expo backoff, use jitter --- agent/agent_worker_test.go | 4 ++-- core/client.go | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/agent/agent_worker_test.go b/agent/agent_worker_test.go index 9c87589e77..b9fb208750 100644 --- a/agent/agent_worker_test.go +++ b/agent/agent_worker_test.go @@ -218,8 +218,8 @@ func TestAcquireAndRunJobWaiting(t *testing.T) { } // the last Retry-After is not recorded as the retries loop exits before using it - expectedSleeps := make([]time.Duration, 0, 9) - for d := 1; d <= 1<<8; d *= 2 { + expectedSleeps := make([]time.Duration, 0, 6) + for d := 1; d <= 1<<5; d *= 2 { expectedSleeps = append(expectedSleeps, time.Duration(d)*time.Second) } assert.Equal(t, expectedSleeps, retrySleeps) diff --git a/core/client.go b/core/client.go index 6edf1bc542..5932a95224 100644 --- a/core/client.go +++ b/core/client.go @@ -56,19 +56,20 @@ func (c *Client) AcquireJob(ctx context.Context, jobID string) (*api.Job, error) // large if the job is in the waiting state. // // If there were no delays or jitter, the attempts would happen at t = 0, 1, 2, 4, ..., 128s - // after the initial one. Therefore, there are 9 attempts taking at least 255s. If the jitter - // always hit the max of 1s, then another 8s is added to that. This is still comfortably within - // the timeout of 270s, and the bound seems tight enough so that the agent is not wasting time + // after the initial one. Therefore, there are 7 attempts taking at least 255s. If the jitter + // always hit the max of 5s, then another 40s is added to that. This is still comfortably within + // the timeout of 330s, and the bound seems tight enough so that the agent is not wasting time // waiting for a retry that will never happen. - timeoutCtx, cancel := context.WithTimeout(ctx, 270*time.Second) + timeoutCtx, cancel := context.WithTimeout(ctx, 330*time.Second) defer cancel() // Acquire the job using the ID we were provided. We'll retry as best we can on non 422 error. // Except for 423 errors, in which we exponentially back off under the direction of the API // setting the Retry-After header r := roko.NewRetrier( - roko.WithMaxAttempts(10), - roko.WithStrategy(roko.Constant(3*time.Second)), + roko.WithMaxAttempts(7), + roko.WithStrategy(roko.Exponential(2*time.Second, 0)), + roko.WithJitterRange(-time.Second, 5*time.Second), roko.WithSleepFunc(c.RetrySleepFunc), )