From 568a68dba48c9596ea152267d9f4a642f111aa0e Mon Sep 17 00:00:00 2001 From: Jeff Wolski Date: Mon, 19 Oct 2015 19:10:29 +0200 Subject: [PATCH 1/6] Join duration w/ backoff --- config.js | 7 +- index.js | 5 +- lib/gossip/{join-sender.js => joiner.js} | 121 ++++++++---------- server/admin/member.js | 2 +- .../{join-sender-test.js => joiner_test.js} | 2 +- 5 files changed, 63 insertions(+), 74 deletions(-) rename lib/gossip/{join-sender.js => joiner.js} (83%) rename test/unit/{join-sender-test.js => joiner_test.js} (98%) diff --git a/config.js b/config.js index 8ff5ae0a..63347d21 100644 --- a/config.js +++ b/config.js @@ -82,7 +82,12 @@ Config.prototype._seed = function _seed(seed) { seedOrDefault('dampScoringReuseLimit', 2500); seedOrDefault('dampScoringSuppressDuration', 60 * 60 * 1000); // 1 hr in ms seedOrDefault('dampScoringSuppressLimit', 5000); - seedOrDefault('maxJoinAttempts', 50, numValidator); + + // Joiner config + seedOrDefault('joinDelayMin', 100, numValidator); // ms + seedOrDefault('joinDelayMax', 2 * 60 * 1000, numValidator); // 2 min in ms + seedOrDefault('maxJoinDuration', 20 * 60 * 1000, numValidator); // 20 mins in ms + seedOrDefault('memberBlacklist', [], function validator(vals) { return _.all(vals, function all(val) { return val instanceof RegExp; diff --git a/index.js b/index.js index 366fc120..37ad844d 100644 --- a/index.js +++ b/index.js @@ -61,11 +61,10 @@ var registerRingpopListeners = require('./lib/on_ringpop_event.js').register; var RingpopClient = require('./client.js'); var RingpopServer = require('./server'); var safeParse = require('./lib/util').safeParse; -var sendJoin = require('./lib/gossip/join-sender.js').joinCluster; +var sendJoin = require('./lib/gossip/joiner.js').joinCluster; var TracerStore = require('./lib/trace/store.js'); var HOST_PORT_PATTERN = /^(\d+.\d+.\d+.\d+):\d+$/; -var MAX_JOIN_DURATION = 300000; var MEMBERSHIP_UPDATE_FLUSH_INTERVAL = 5000; function RingPop(options) { @@ -115,7 +114,6 @@ function RingPop(options) { this.pingTimeout = options.pingTimeout || 1500; this.joinTimeout = options.joinTimeout || 1000; this.proxyReqTimeout = options.proxyReqTimeout || 30000; - this.maxJoinDuration = options.maxJoinDuration || MAX_JOIN_DURATION; this.membershipUpdateFlushInterval = options.membershipUpdateFlushInterval || MEMBERSHIP_UPDATE_FLUSH_INTERVAL; @@ -282,7 +280,6 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { sendJoin({ ringpop: self, - maxJoinDuration: self.maxJoinDuration, joinSize: self.joinSize, parallelismFactor: opts.joinParallelismFactor, joinTimeout: self.joinTimeout diff --git a/lib/gossip/join-sender.js b/lib/gossip/joiner.js similarity index 83% rename from lib/gossip/join-sender.js rename to lib/gossip/joiner.js index d9389945..9993dd34 100644 --- a/lib/gossip/join-sender.js +++ b/lib/gossip/joiner.js @@ -21,6 +21,7 @@ var captureHost = require('../util.js').captureHost; var errors = require('../errors.js'); +var globalTimers = require('timers'); var isEmptyArray = require('../util.js').isEmptyArray; var mergeJoinResponses = require('./join-response-merge.js'); var numOrDefault = require('../util.js').numOrDefault; @@ -40,29 +41,8 @@ var JoinDurationExceededError = TypedError({ max: null }); -var JoinAttemptsExceededError = TypedError({ - type: 'ringpop.join-attempts-exceeded', - message: 'Join attempts of `{joinAttempts}` ' + - 'exceeded max `{maxJoinAttempts}`.\n', - joinAttempts: null, - maxJoinAttempts: null -}); - -var JOIN_RETRY_DELAY = 100; var JOIN_SIZE = 3; var JOIN_TIMEOUT = 1000; -// If a node cannot complete a join within MAX_JOIN_DURATION -// there is likely something very wrong. The aim is for the join -// operation to take no more than 1s, under normal conditions. -// -// The duration assigned below is very high for the following -// purposes: -// - Gives an application developer some time to diagnose -// what could be wrong. -// - Gives an operator some time to bootstrap a newly -// provisioned cluster -// - Trying forever is futile -var MAX_JOIN_DURATION = 120000; var PARALLELISM_FACTOR = 2; function isSingleNodeCluster(ringpop) { @@ -79,7 +59,7 @@ function takeNode(hosts) { return host; } -function JoinCluster(opts) { +function Joiner(opts) { opts = opts || {}; if (!opts.ringpop) { @@ -97,6 +77,7 @@ function JoinCluster(opts) { } this.ringpop = opts.ringpop; + this.timers = opts.timers || globalTimers; this.host = captureHost(this.ringpop.hostPort); this.joinTimeout = numOrDefault(opts.joinTimeout, JOIN_TIMEOUT); @@ -107,18 +88,6 @@ function JoinCluster(opts) { this.parallelismFactor = numOrDefault(opts.parallelismFactor, PARALLELISM_FACTOR); - // We eventually want to give up if the join process cannot - // succeed. `maxJoinDuration` is used to restrict that process - // to a certain time limit. - this.maxJoinDuration = numOrDefault(opts.maxJoinDuration, - MAX_JOIN_DURATION); - - // We do not want to retry joining as hard as we can. We - // want to have some fixed backoff applied before we try - // to join again - this.joinRetryDelay = numOrDefault(opts.joinRetryDelay, - JOIN_RETRY_DELAY); - // Potential nodes are nodes in the ringpop bootstrap // list that can be joined. Upon instantiation, this step // simply filters out a node from attempting to join itself. @@ -144,10 +113,13 @@ function JoinCluster(opts) { // Changes received by other nodes will be aggregated and // applied once the join process is complete. this.joinResponses = []; + + this.joinDelay = this.ringpop.config.get('joinDelayMin'); + this.joinRetries = 0; } // Potential nodes are those that are not this instance of ringpop. -JoinCluster.prototype.collectPotentialNodes = function collectPotentialNodes(nodesJoined) { +Joiner.prototype.collectPotentialNodes = function collectPotentialNodes(nodesJoined) { nodesJoined = nodesJoined || []; var self = this; @@ -157,7 +129,7 @@ JoinCluster.prototype.collectPotentialNodes = function collectPotentialNodes(nod }; // Preferred nodes are those that are not on the same host as this instance of ringpop. -JoinCluster.prototype.collectPreferredNodes = function collectPreferredNodes() { +Joiner.prototype.collectPreferredNodes = function collectPreferredNodes() { var self = this; return this.potentialNodes.filter(function filterHost(hostPort) { return self.host !== captureHost(hostPort); @@ -165,7 +137,7 @@ JoinCluster.prototype.collectPreferredNodes = function collectPreferredNodes() { }; // Non-preferred nodes are everyone else. -JoinCluster.prototype.collectNonPreferredNodes = function collectNonPreferredNodes() { +Joiner.prototype.collectNonPreferredNodes = function collectNonPreferredNodes() { var self = this; if (isEmptyArray(this.preferredNodes)) { @@ -177,7 +149,7 @@ JoinCluster.prototype.collectNonPreferredNodes = function collectNonPreferredNod } }; -JoinCluster.prototype.init = function init(nodesJoined) { +Joiner.prototype.init = function init(nodesJoined) { // TODO The "collect" operations are fairly inefficient. This // can be improved by indexing by host/port values. this.potentialNodes = this.collectPotentialNodes(nodesJoined); @@ -191,7 +163,7 @@ JoinCluster.prototype.init = function init(nodesJoined) { this.roundNonPreferredNodes = this.nonPreferredNodes.slice(0); }; -JoinCluster.prototype.join = function join(callback) { +Joiner.prototype.join = function join(callback) { var self = this; if (this.ringpop.destroyed) { @@ -221,7 +193,6 @@ JoinCluster.prototype.join = function join(callback) { var numFailed = 0; var startTime = Date.now(); var calledBack = false; - var maxJoinAttempts = this.ringpop.config.get('maxJoinAttempts'); function onJoin(err, nodes) { if (calledBack) { @@ -271,29 +242,14 @@ JoinCluster.prototype.join = function join(callback) { calledBack = true; callback(null, nodesJoined); - } else if (numFailed >= maxJoinAttempts) { - self.ringpop.logger.warn('ringpop max join attempts exceeded', { - local: self.ringpop.whoami(), - joinAttempts: numFailed, - maxJoinAttempts: maxJoinAttempts, - numJoined: numJoined, - numFailed: numFailed, - startTime: startTime - }); - - calledBack = true; - callback(JoinAttemptsExceededError({ - joinAttempts: numFailed, - maxJoinAttempts: maxJoinAttempts - })); - return; } else { var joinDuration = Date.now() - startTime; - if (joinDuration > self.maxJoinDuration) { + var maxJoinDuration = self.ringpop.config.get('maxJoinDuration'); + if (joinDuration > maxJoinDuration) { self.ringpop.logger.warn('ringpop max join duration exceeded', { local: self.ringpop.whoami(), joinDuration: joinDuration, - maxJoinDuration: self.maxJoinDuration, + maxJoinDuration: maxJoinDuration, numJoined: numJoined, numFailed: numFailed, startTime: startTime @@ -302,7 +258,7 @@ JoinCluster.prototype.join = function join(callback) { calledBack = true; callback(JoinDurationExceededError({ joinDuration: joinDuration, - maxJoinDuration: self.maxJoinDuration + maxJoinDuration: maxJoinDuration })); return; } @@ -315,18 +271,49 @@ JoinCluster.prototype.join = function join(callback) { numNodesLeft: self.joinSize - numJoined }); - setTimeout(reJoin, self.joinRetryDelay); - } + var oldJoinDelay = self.joinDelay; + var delayMax = self.ringpop.config.get('joinDelayMax'); + var delayMin = self.ringpop.config.get('joinDelayMin'); + self.joinDelay = Math.min(delayMax, delayMin * Math.pow(2, + self.joinRetries)); + + // Determine if the join delay has exceeded the maximum + // delay and send out a warning letting developer's know + // that Ringpop is having trouble. + if (oldJoinDelay < delayMax && + self.joinDelay >= delayMax) { + var errorMsg = 'ringpop joiner reached max retry delay. ' + + 'this is a strong indication that ringpop is having ' + + 'trouble joining a cluster and could be due to a ' + + 'misconfiguration of your environment. ringpop will ' + + 'continue to join up to the max join duration.'; + self.ringpop.logger.error(errorMsg, { + local: self.ringpop.whoami(), + retriesSoFar: self.joinRetries, + joinDelayMax: delayMax, + maxJoinDuration: maxJoinDuration + }); + } - function reJoin() { - self.joinGroup(nodesJoined, onJoin); + self.ringpop.logger.info('ringpop joiner will attempt retry after delay', { + local: self.ringpop.whoami(), + retriesSoFar: self.joinRetries, + delay: self.joinDelay, + maxJoinDuration: maxJoinDuration, + timeJoiningSoFar: Date.now() - startTime + }); + // Attempt to retry the join after applying the delay backoff. + self.timers.setTimeout(function onTimeout() { + self.joinRetries++; + self.joinGroup(nodesJoined, onJoin); + }, self.joinDelay); } } this.joinGroup(nodesJoined, onJoin); }; -JoinCluster.prototype.joinGroup = function joinGroup(totalNodesJoined, callback) { +Joiner.prototype.joinGroup = function joinGroup(totalNodesJoined, callback) { var self = this; var group = this.selectGroup(totalNodesJoined); @@ -383,7 +370,7 @@ JoinCluster.prototype.joinGroup = function joinGroup(totalNodesJoined, callback) } }; -JoinCluster.prototype.joinNode = function joinNode(node, callback) { +Joiner.prototype.joinNode = function joinNode(node, callback) { var self = this; var joinOpts = { host: node, @@ -442,7 +429,7 @@ JoinCluster.prototype.joinNode = function joinNode(node, callback) { } }; -JoinCluster.prototype.selectGroup = function selectGroup(nodesJoined) { +Joiner.prototype.selectGroup = function selectGroup(nodesJoined) { nodesJoined = nodesJoined || []; var self = this; @@ -483,7 +470,7 @@ JoinCluster.prototype.selectGroup = function selectGroup(nodesJoined) { }; function createJoiner(opts) { - return new JoinCluster(opts); + return new Joiner(opts); } function joinCluster(opts, callback) { diff --git a/server/admin/member.js b/server/admin/member.js index 969815d1..19ac7f06 100644 --- a/server/admin/member.js +++ b/server/admin/member.js @@ -21,7 +21,7 @@ var errors = require('../../lib/errors.js'); var Member = require('../../lib/membership/member.js'); -var sendJoin = require('../../lib/gossip/join-sender.js').joinCluster; +var sendJoin = require('../../lib/gossip/joiner.js').joinCluster; var TypedError = require('error/typed'); var RedundantLeaveError = TypedError({ diff --git a/test/unit/join-sender-test.js b/test/unit/joiner_test.js similarity index 98% rename from test/unit/join-sender-test.js rename to test/unit/joiner_test.js index 751365fb..f605f022 100644 --- a/test/unit/join-sender-test.js +++ b/test/unit/joiner_test.js @@ -19,7 +19,7 @@ // THE SOFTWARE. 'use strict'; -var createJoiner = require('../../lib/gossip/join-sender.js').createJoiner; +var createJoiner = require('../../lib/gossip/joiner.js').createJoiner; var Ringpop = require('../../index.js'); var test = require('tape'); From 3e5f8a9426ba914b09eb61c48a991e783a4410e1 Mon Sep 17 00:00:00 2001 From: Jeff Wolski Date: Tue, 20 Oct 2015 05:31:55 +0200 Subject: [PATCH 2/6] Add fuzz to backoff; adapt tests to changes --- config.js | 1 + index.js | 3 +++ lib/gossip/joiner.js | 24 ++++++++++++------------ test/integration/join-test.js | 6 ++++-- test/unit/config_test.js | 4 ++-- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/config.js b/config.js index 63347d21..5ee6fb72 100644 --- a/config.js +++ b/config.js @@ -63,6 +63,7 @@ Config.prototype._seed = function _seed(seed) { var self = this; // All config names should be camel-cased. + seedOrDefault('TEST_KEY', 100, numValidator); // never remove, tests and lives depend on it seedOrDefault('TEST_KEY', 100); // never remove, tests and lives depend on it // Gossip configs diff --git a/index.js b/index.js index 37ad844d..f9e503dc 100644 --- a/index.js +++ b/index.js @@ -287,6 +287,7 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { joinTime = Date.now() - joinTime; if (err) { + self.stat('increment', 'join.failed.err'); self.logger.error('ringpop bootstrap failed', { error: err, address: self.hostPort @@ -296,6 +297,7 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { } if (self.destroyed) { + self.stat('increment', 'join.failed.destroyed'); var destroyedMsg = 'ringpop was destroyed ' + 'during bootstrap'; self.logger.error(destroyedMsg, { @@ -317,6 +319,7 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { bootstrapTime = Date.now() - bootstrapTime; + self.stat('increment', 'join.succeeded'); self.logger.debug('ringpop is ready', { address: self.hostPort, memberCount: self.membership.getMemberCount(), diff --git a/lib/gossip/joiner.js b/lib/gossip/joiner.js index 9993dd34..837af7aa 100644 --- a/lib/gossip/joiner.js +++ b/lib/gossip/joiner.js @@ -263,19 +263,13 @@ Joiner.prototype.join = function join(callback) { return; } - self.ringpop.logger.debug('ringpop join not yet complete', { - local: self.ringpop.whoami(), - joinSize: self.joinSize, - numJoined: numJoined, - numFailed: numFailed, - numNodesLeft: self.joinSize - numJoined - }); - var oldJoinDelay = self.joinDelay; var delayMax = self.ringpop.config.get('joinDelayMax'); var delayMin = self.ringpop.config.get('joinDelayMin'); - self.joinDelay = Math.min(delayMax, delayMin * Math.pow(2, - self.joinRetries)); + var newDelay = delayMin * Math.pow(2, self.joinRetries); + var withFuzz = Math.floor(Math.random() * + ((newDelay * 1.5) - newDelay)) + newDelay; + self.joinDelay = Math.min(delayMax, withFuzz); // Determine if the join delay has exceeded the maximum // delay and send out a warning letting developer's know @@ -295,12 +289,18 @@ Joiner.prototype.join = function join(callback) { }); } - self.ringpop.logger.info('ringpop joiner will attempt retry after delay', { + self.ringpop.logger.info('ringpop joiner not yet complete; will attempt retry after delay', { local: self.ringpop.whoami(), retriesSoFar: self.joinRetries, delay: self.joinDelay, maxJoinDuration: maxJoinDuration, - timeJoiningSoFar: Date.now() - startTime + timeJoiningSoFar: Date.now() - startTime, + delayWithoutFuzz: newDelay, + delayWithFuzz: self.joinDelay, + joinSize: self.joinSize, + numNodesJoined: numJoined, + numNodesFailed: numFailed, + numNodesLeft: self.joinSize - numJoined }); // Attempt to retry the join after applying the delay backoff. self.timers.setTimeout(function onTimeout() { diff --git a/test/integration/join-test.js b/test/integration/join-test.js index d3ea3806..8838f987 100644 --- a/test/integration/join-test.js +++ b/test/integration/join-test.js @@ -152,8 +152,10 @@ testRingpopCluster({ testRingpopCluster({ size: 3, tap: function tap(cluster) { - // This'll make Node 0's join fail faster - cluster[0].config.set('maxJoinAttempts', 1); + // Setting join config will make Node 0's join fail faster + cluster[0].config.set('joinDelayMax', 0); + cluster[0].config.set('joinDelayMin', 0); + cluster[0].config.set('maxJoinDuration', 1); cluster[1].config.set('memberBlacklist', [/127.0.0.1:10000/]); } }, 'join blacklist', function t(bootRes, cluster, assert) { diff --git a/test/unit/config_test.js b/test/unit/config_test.js index 5bf5fc5f..6ef6a13d 100644 --- a/test/unit/config_test.js +++ b/test/unit/config_test.js @@ -93,9 +93,9 @@ test('validates memberBlacklist seed', function t(assert) { test('validates num', function t(assert) { var config = new Config(null, { - 'maxJoinAttempts': 'notanum' + 'TEST_KEY': 'notanum' }); - assert.equals(50, config.get('maxJoinAttempts'), + assert.equals(100, config.get('TEST_KEY'), 'uses default'); assert.end(); }); From ba045db46433ef654b50e4eaa2d48179ed22401f Mon Sep 17 00:00:00 2001 From: Jeff Wolski Date: Tue, 20 Oct 2015 05:40:11 +0200 Subject: [PATCH 3/6] Make error logging configurable --- config.js | 1 + lib/gossip/joiner.js | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/config.js b/config.js index 5ee6fb72..82bb2712 100644 --- a/config.js +++ b/config.js @@ -87,6 +87,7 @@ Config.prototype._seed = function _seed(seed) { // Joiner config seedOrDefault('joinDelayMin', 100, numValidator); // ms seedOrDefault('joinDelayMax', 2 * 60 * 1000, numValidator); // 2 min in ms + seedOrDefault('joinTroubleErrorEnabled', true); seedOrDefault('maxJoinDuration', 20 * 60 * 1000, numValidator); // 20 mins in ms seedOrDefault('memberBlacklist', [], function validator(vals) { diff --git a/lib/gossip/joiner.js b/lib/gossip/joiner.js index 837af7aa..75e0e2f7 100644 --- a/lib/gossip/joiner.js +++ b/lib/gossip/joiner.js @@ -274,7 +274,8 @@ Joiner.prototype.join = function join(callback) { // Determine if the join delay has exceeded the maximum // delay and send out a warning letting developer's know // that Ringpop is having trouble. - if (oldJoinDelay < delayMax && + if (self.ringpop.config.get('joinTroubleErrorEnabled') && + oldJoinDelay < delayMax && self.joinDelay >= delayMax) { var errorMsg = 'ringpop joiner reached max retry delay. ' + 'this is a strong indication that ringpop is having ' + From 548d86fbf49f6185e8226573b8ab886c7dd2fec2 Mon Sep 17 00:00:00 2001 From: Jeff Wolski Date: Tue, 20 Oct 2015 09:30:40 +0200 Subject: [PATCH 4/6] Add a comment to delay computation --- lib/gossip/joiner.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/gossip/joiner.js b/lib/gossip/joiner.js index 75e0e2f7..110ca08b 100644 --- a/lib/gossip/joiner.js +++ b/lib/gossip/joiner.js @@ -263,6 +263,10 @@ Joiner.prototype.join = function join(callback) { return; } + // Compute a join delay that grows exponentially with + // each join retry that is performed. Add a randomized + // fuzz to the delay that is between 1x and 1.5x its + // value. var oldJoinDelay = self.joinDelay; var delayMax = self.ringpop.config.get('joinDelayMax'); var delayMin = self.ringpop.config.get('joinDelayMin'); From ff98092201e5cdd4dd65c0619a938f26894f52a8 Mon Sep 17 00:00:00 2001 From: Jeff Wolski Date: Tue, 20 Oct 2015 10:13:32 +0200 Subject: [PATCH 5/6] Ben's PR feedback; comment correction --- lib/gossip/joiner.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gossip/joiner.js b/lib/gossip/joiner.js index 110ca08b..c4cdf4f6 100644 --- a/lib/gossip/joiner.js +++ b/lib/gossip/joiner.js @@ -276,7 +276,7 @@ Joiner.prototype.join = function join(callback) { self.joinDelay = Math.min(delayMax, withFuzz); // Determine if the join delay has exceeded the maximum - // delay and send out a warning letting developer's know + // delay and send out a warning letting developers know // that Ringpop is having trouble. if (self.ringpop.config.get('joinTroubleErrorEnabled') && oldJoinDelay < delayMax && From d6c44e372fe5eeef9afe8cfcc891700f9462f71a Mon Sep 17 00:00:00 2001 From: Jeff Wolski Date: Tue, 20 Oct 2015 10:30:38 +0200 Subject: [PATCH 6/6] Bad merge --- config.js | 1 - 1 file changed, 1 deletion(-) diff --git a/config.js b/config.js index 82bb2712..30ca6f1d 100644 --- a/config.js +++ b/config.js @@ -64,7 +64,6 @@ Config.prototype._seed = function _seed(seed) { // All config names should be camel-cased. seedOrDefault('TEST_KEY', 100, numValidator); // never remove, tests and lives depend on it - seedOrDefault('TEST_KEY', 100); // never remove, tests and lives depend on it // Gossip configs seedOrDefault('autoGossip', true);