Skip to content

Commit 094e40b

Browse files
Add jitter to ExponentialBackoff (#4476)
* add backoff jitter, tests and env variable * increase default ERROR_RETRY_CEIL to 60 minutes
1 parent 1b0031a commit 094e40b

File tree

3 files changed

+51
-4
lines changed

3 files changed

+51
-4
lines changed

core/src/subgraph/runner.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,10 @@ where
6060
should_try_unfail_non_deterministic: true,
6161
synced: false,
6262
skip_ptr_updates_timer: Instant::now(),
63-
backoff: ExponentialBackoff::new(
63+
backoff: ExponentialBackoff::with_jitter(
6464
(MINUTE * 2).min(env_vars.subgraph_error_retry_ceil),
6565
env_vars.subgraph_error_retry_ceil,
66+
env_vars.subgraph_error_retry_jitter,
6667
),
6768
entity_lfu_cache: LfuCache::new(),
6869
},

graph/src/env/mod.rs

+10-2
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,13 @@ pub struct EnvVars {
133133
/// Ceiling for the backoff retry of non-deterministic errors.
134134
///
135135
/// Set by the environment variable `GRAPH_SUBGRAPH_ERROR_RETRY_CEIL_SECS`
136-
/// (expressed in seconds). The default value is 1800s (30 minutes).
136+
/// (expressed in seconds). The default value is 3600s (60 minutes).
137137
pub subgraph_error_retry_ceil: Duration,
138+
/// Jitter factor for the backoff retry of non-deterministic errors.
139+
///
140+
/// Set by the environment variable `GRAPH_SUBGRAPH_ERROR_RETRY_JITTER`
141+
/// (clamped between 0.0 and 1.0). The default value is 0.2.
142+
pub subgraph_error_retry_jitter: f64,
138143
/// Experimental feature.
139144
///
140145
/// Set by the flag `GRAPH_ENABLE_SELECT_BY_SPECIFIC_ATTRIBUTES`. Off by
@@ -210,6 +215,7 @@ impl EnvVars {
210215
subgraph_max_data_sources: inner.subgraph_max_data_sources.0,
211216
disable_fail_fast: inner.disable_fail_fast.0,
212217
subgraph_error_retry_ceil: Duration::from_secs(inner.subgraph_error_retry_ceil_in_secs),
218+
subgraph_error_retry_jitter: inner.subgraph_error_retry_jitter,
213219
enable_select_by_specific_attributes: inner.enable_select_by_specific_attributes.0,
214220
log_trigger_data: inner.log_trigger_data.0,
215221
explorer_ttl: Duration::from_secs(inner.explorer_ttl_in_secs),
@@ -313,8 +319,10 @@ struct Inner {
313319
subgraph_max_data_sources: NoUnderscores<usize>,
314320
#[envconfig(from = "GRAPH_DISABLE_FAIL_FAST", default = "false")]
315321
disable_fail_fast: EnvVarBoolean,
316-
#[envconfig(from = "GRAPH_SUBGRAPH_ERROR_RETRY_CEIL_SECS", default = "1800")]
322+
#[envconfig(from = "GRAPH_SUBGRAPH_ERROR_RETRY_CEIL_SECS", default = "3600")]
317323
subgraph_error_retry_ceil_in_secs: u64,
324+
#[envconfig(from = "GRAPH_SUBGRAPH_ERROR_RETRY_JITTER", default = "0.2")]
325+
subgraph_error_retry_jitter: f64,
318326
#[envconfig(from = "GRAPH_ENABLE_SELECT_BY_SPECIFIC_ATTRIBUTES", default = "false")]
319327
enable_select_by_specific_attributes: EnvVarBoolean,
320328
#[envconfig(from = "GRAPH_LOG_TRIGGER_DATA", default = "false")]

graph/src/util/backoff.rs

+39-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ pub struct ExponentialBackoff {
88
pub attempt: u64,
99
base: Duration,
1010
ceiling: Duration,
11+
jitter: f64,
1112
}
1213

1314
impl ExponentialBackoff {
@@ -16,6 +17,19 @@ impl ExponentialBackoff {
1617
attempt: 0,
1718
base,
1819
ceiling,
20+
jitter: 0.0,
21+
}
22+
}
23+
24+
// Create ExponentialBackoff with jitter
25+
// jitter is a value between 0.0 and 1.0. Sleep delay will be randomized
26+
// within `jitter` of the normal sleep delay
27+
pub fn with_jitter(base: Duration, ceiling: Duration, jitter: f64) -> Self {
28+
ExponentialBackoff {
29+
attempt: 0,
30+
base,
31+
ceiling,
32+
jitter: jitter.clamp(0.0, 1.0),
1933
}
2034
}
2135

@@ -37,7 +51,8 @@ impl ExponentialBackoff {
3751
if delay > self.ceiling {
3852
delay = self.ceiling;
3953
}
40-
delay
54+
let jitter = rand::Rng::gen_range(&mut rand::thread_rng(), -self.jitter..=self.jitter);
55+
delay.mul_f64(1.0 + jitter)
4156
}
4257

4358
fn next_attempt(&mut self) -> Duration {
@@ -80,6 +95,29 @@ mod tests {
8095
assert_eq!(backoff.next_attempt(), Duration::from_secs(5));
8196
}
8297

98+
#[test]
99+
fn test_delay_with_jitter() {
100+
let mut backoff = ExponentialBackoff::with_jitter(
101+
Duration::from_millis(1000),
102+
Duration::from_secs(5),
103+
0.1,
104+
);
105+
106+
// Delay should be between 0.5s and 1.5s
107+
let delay1 = backoff.delay();
108+
assert!(delay1 > Duration::from_millis(900) && delay1 <= Duration::from_millis(1100));
109+
let delay2 = backoff.delay();
110+
assert!(delay2 > Duration::from_millis(900) && delay2 <= Duration::from_millis(1100));
111+
112+
// Delays should be random and different
113+
assert_ne!(delay1, delay2);
114+
115+
// Test ceiling
116+
backoff.attempt = 123456;
117+
let delay = backoff.delay();
118+
assert!(delay > Duration::from_millis(4500) && delay <= Duration::from_millis(5500));
119+
}
120+
83121
#[test]
84122
fn test_overflow_delay() {
85123
let mut backoff =

0 commit comments

Comments
 (0)