diff --git a/config.js b/config.js index 8ff5ae0a..30ca6f1d 100644 --- a/config.js +++ b/config.js @@ -63,7 +63,7 @@ Config.prototype._seed = function _seed(seed) { var self = this; // All config names should be camel-cased. - seedOrDefault('TEST_KEY', 100); // never remove, tests and lives depend on it + seedOrDefault('TEST_KEY', 100, numValidator); // never remove, tests and lives depend on it // Gossip configs seedOrDefault('autoGossip', true); @@ -82,7 +82,13 @@ Config.prototype._seed = function _seed(seed) { seedOrDefault('dampScoringReuseLimit', 2500); seedOrDefault('dampScoringSuppressDuration', 60 * 60 * 1000); // 1 hr in ms seedOrDefault('dampScoringSuppressLimit', 5000); - seedOrDefault('maxJoinAttempts', 50, numValidator); + + // Joiner config + seedOrDefault('joinDelayMin', 100, numValidator); // ms + seedOrDefault('joinDelayMax', 2 * 60 * 1000, numValidator); // 2 min in ms + seedOrDefault('joinTroubleErrorEnabled', true); + seedOrDefault('maxJoinDuration', 20 * 60 * 1000, numValidator); // 20 mins in ms + seedOrDefault('memberBlacklist', [], function validator(vals) { return _.all(vals, function all(val) { return val instanceof RegExp; diff --git a/index.js b/index.js index 366fc120..f9e503dc 100644 --- a/index.js +++ b/index.js @@ -61,11 +61,10 @@ var registerRingpopListeners = require('./lib/on_ringpop_event.js').register; var RingpopClient = require('./client.js'); var RingpopServer = require('./server'); var safeParse = require('./lib/util').safeParse; -var sendJoin = require('./lib/gossip/join-sender.js').joinCluster; +var sendJoin = require('./lib/gossip/joiner.js').joinCluster; var TracerStore = require('./lib/trace/store.js'); var HOST_PORT_PATTERN = /^(\d+.\d+.\d+.\d+):\d+$/; -var MAX_JOIN_DURATION = 300000; var MEMBERSHIP_UPDATE_FLUSH_INTERVAL = 5000; function RingPop(options) { @@ -115,7 +114,6 @@ function RingPop(options) { this.pingTimeout = options.pingTimeout || 1500; this.joinTimeout = options.joinTimeout || 1000; this.proxyReqTimeout = options.proxyReqTimeout || 30000; - this.maxJoinDuration = options.maxJoinDuration || MAX_JOIN_DURATION; this.membershipUpdateFlushInterval = options.membershipUpdateFlushInterval || MEMBERSHIP_UPDATE_FLUSH_INTERVAL; @@ -282,7 +280,6 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { sendJoin({ ringpop: self, - maxJoinDuration: self.maxJoinDuration, joinSize: self.joinSize, parallelismFactor: opts.joinParallelismFactor, joinTimeout: self.joinTimeout @@ -290,6 +287,7 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { joinTime = Date.now() - joinTime; if (err) { + self.stat('increment', 'join.failed.err'); self.logger.error('ringpop bootstrap failed', { error: err, address: self.hostPort @@ -299,6 +297,7 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { } if (self.destroyed) { + self.stat('increment', 'join.failed.destroyed'); var destroyedMsg = 'ringpop was destroyed ' + 'during bootstrap'; self.logger.error(destroyedMsg, { @@ -320,6 +319,7 @@ RingPop.prototype.bootstrap = function bootstrap(opts, callback) { bootstrapTime = Date.now() - bootstrapTime; + self.stat('increment', 'join.succeeded'); self.logger.debug('ringpop is ready', { address: self.hostPort, memberCount: self.membership.getMemberCount(), diff --git a/lib/gossip/join-sender.js b/lib/gossip/joiner.js similarity index 80% rename from lib/gossip/join-sender.js rename to lib/gossip/joiner.js index d9389945..c4cdf4f6 100644 --- a/lib/gossip/join-sender.js +++ b/lib/gossip/joiner.js @@ -21,6 +21,7 @@ var captureHost = require('../util.js').captureHost; var errors = require('../errors.js'); +var globalTimers = require('timers'); var isEmptyArray = require('../util.js').isEmptyArray; var mergeJoinResponses = require('./join-response-merge.js'); var numOrDefault = require('../util.js').numOrDefault; @@ -40,29 +41,8 @@ var JoinDurationExceededError = TypedError({ max: null }); -var JoinAttemptsExceededError = TypedError({ - type: 'ringpop.join-attempts-exceeded', - message: 'Join attempts of `{joinAttempts}` ' + - 'exceeded max `{maxJoinAttempts}`.\n', - joinAttempts: null, - maxJoinAttempts: null -}); - -var JOIN_RETRY_DELAY = 100; var JOIN_SIZE = 3; var JOIN_TIMEOUT = 1000; -// If a node cannot complete a join within MAX_JOIN_DURATION -// there is likely something very wrong. The aim is for the join -// operation to take no more than 1s, under normal conditions. -// -// The duration assigned below is very high for the following -// purposes: -// - Gives an application developer some time to diagnose -// what could be wrong. -// - Gives an operator some time to bootstrap a newly -// provisioned cluster -// - Trying forever is futile -var MAX_JOIN_DURATION = 120000; var PARALLELISM_FACTOR = 2; function isSingleNodeCluster(ringpop) { @@ -79,7 +59,7 @@ function takeNode(hosts) { return host; } -function JoinCluster(opts) { +function Joiner(opts) { opts = opts || {}; if (!opts.ringpop) { @@ -97,6 +77,7 @@ function JoinCluster(opts) { } this.ringpop = opts.ringpop; + this.timers = opts.timers || globalTimers; this.host = captureHost(this.ringpop.hostPort); this.joinTimeout = numOrDefault(opts.joinTimeout, JOIN_TIMEOUT); @@ -107,18 +88,6 @@ function JoinCluster(opts) { this.parallelismFactor = numOrDefault(opts.parallelismFactor, PARALLELISM_FACTOR); - // We eventually want to give up if the join process cannot - // succeed. `maxJoinDuration` is used to restrict that process - // to a certain time limit. - this.maxJoinDuration = numOrDefault(opts.maxJoinDuration, - MAX_JOIN_DURATION); - - // We do not want to retry joining as hard as we can. We - // want to have some fixed backoff applied before we try - // to join again - this.joinRetryDelay = numOrDefault(opts.joinRetryDelay, - JOIN_RETRY_DELAY); - // Potential nodes are nodes in the ringpop bootstrap // list that can be joined. Upon instantiation, this step // simply filters out a node from attempting to join itself. @@ -144,10 +113,13 @@ function JoinCluster(opts) { // Changes received by other nodes will be aggregated and // applied once the join process is complete. this.joinResponses = []; + + this.joinDelay = this.ringpop.config.get('joinDelayMin'); + this.joinRetries = 0; } // Potential nodes are those that are not this instance of ringpop. -JoinCluster.prototype.collectPotentialNodes = function collectPotentialNodes(nodesJoined) { +Joiner.prototype.collectPotentialNodes = function collectPotentialNodes(nodesJoined) { nodesJoined = nodesJoined || []; var self = this; @@ -157,7 +129,7 @@ JoinCluster.prototype.collectPotentialNodes = function collectPotentialNodes(nod }; // Preferred nodes are those that are not on the same host as this instance of ringpop. -JoinCluster.prototype.collectPreferredNodes = function collectPreferredNodes() { +Joiner.prototype.collectPreferredNodes = function collectPreferredNodes() { var self = this; return this.potentialNodes.filter(function filterHost(hostPort) { return self.host !== captureHost(hostPort); @@ -165,7 +137,7 @@ JoinCluster.prototype.collectPreferredNodes = function collectPreferredNodes() { }; // Non-preferred nodes are everyone else. -JoinCluster.prototype.collectNonPreferredNodes = function collectNonPreferredNodes() { +Joiner.prototype.collectNonPreferredNodes = function collectNonPreferredNodes() { var self = this; if (isEmptyArray(this.preferredNodes)) { @@ -177,7 +149,7 @@ JoinCluster.prototype.collectNonPreferredNodes = function collectNonPreferredNod } }; -JoinCluster.prototype.init = function init(nodesJoined) { +Joiner.prototype.init = function init(nodesJoined) { // TODO The "collect" operations are fairly inefficient. This // can be improved by indexing by host/port values. this.potentialNodes = this.collectPotentialNodes(nodesJoined); @@ -191,7 +163,7 @@ JoinCluster.prototype.init = function init(nodesJoined) { this.roundNonPreferredNodes = this.nonPreferredNodes.slice(0); }; -JoinCluster.prototype.join = function join(callback) { +Joiner.prototype.join = function join(callback) { var self = this; if (this.ringpop.destroyed) { @@ -221,7 +193,6 @@ JoinCluster.prototype.join = function join(callback) { var numFailed = 0; var startTime = Date.now(); var calledBack = false; - var maxJoinAttempts = this.ringpop.config.get('maxJoinAttempts'); function onJoin(err, nodes) { if (calledBack) { @@ -271,29 +242,14 @@ JoinCluster.prototype.join = function join(callback) { calledBack = true; callback(null, nodesJoined); - } else if (numFailed >= maxJoinAttempts) { - self.ringpop.logger.warn('ringpop max join attempts exceeded', { - local: self.ringpop.whoami(), - joinAttempts: numFailed, - maxJoinAttempts: maxJoinAttempts, - numJoined: numJoined, - numFailed: numFailed, - startTime: startTime - }); - - calledBack = true; - callback(JoinAttemptsExceededError({ - joinAttempts: numFailed, - maxJoinAttempts: maxJoinAttempts - })); - return; } else { var joinDuration = Date.now() - startTime; - if (joinDuration > self.maxJoinDuration) { + var maxJoinDuration = self.ringpop.config.get('maxJoinDuration'); + if (joinDuration > maxJoinDuration) { self.ringpop.logger.warn('ringpop max join duration exceeded', { local: self.ringpop.whoami(), joinDuration: joinDuration, - maxJoinDuration: self.maxJoinDuration, + maxJoinDuration: maxJoinDuration, numJoined: numJoined, numFailed: numFailed, startTime: startTime @@ -302,31 +258,67 @@ JoinCluster.prototype.join = function join(callback) { calledBack = true; callback(JoinDurationExceededError({ joinDuration: joinDuration, - maxJoinDuration: self.maxJoinDuration + maxJoinDuration: maxJoinDuration })); return; } - self.ringpop.logger.debug('ringpop join not yet complete', { + // Compute a join delay that grows exponentially with + // each join retry that is performed. Add a randomized + // fuzz to the delay that is between 1x and 1.5x its + // value. + var oldJoinDelay = self.joinDelay; + var delayMax = self.ringpop.config.get('joinDelayMax'); + var delayMin = self.ringpop.config.get('joinDelayMin'); + var newDelay = delayMin * Math.pow(2, self.joinRetries); + var withFuzz = Math.floor(Math.random() * + ((newDelay * 1.5) - newDelay)) + newDelay; + self.joinDelay = Math.min(delayMax, withFuzz); + + // Determine if the join delay has exceeded the maximum + // delay and send out a warning letting developers know + // that Ringpop is having trouble. + if (self.ringpop.config.get('joinTroubleErrorEnabled') && + oldJoinDelay < delayMax && + self.joinDelay >= delayMax) { + var errorMsg = 'ringpop joiner reached max retry delay. ' + + 'this is a strong indication that ringpop is having ' + + 'trouble joining a cluster and could be due to a ' + + 'misconfiguration of your environment. ringpop will ' + + 'continue to join up to the max join duration.'; + self.ringpop.logger.error(errorMsg, { + local: self.ringpop.whoami(), + retriesSoFar: self.joinRetries, + joinDelayMax: delayMax, + maxJoinDuration: maxJoinDuration + }); + } + + self.ringpop.logger.info('ringpop joiner not yet complete; will attempt retry after delay', { local: self.ringpop.whoami(), + retriesSoFar: self.joinRetries, + delay: self.joinDelay, + maxJoinDuration: maxJoinDuration, + timeJoiningSoFar: Date.now() - startTime, + delayWithoutFuzz: newDelay, + delayWithFuzz: self.joinDelay, joinSize: self.joinSize, - numJoined: numJoined, - numFailed: numFailed, + numNodesJoined: numJoined, + numNodesFailed: numFailed, numNodesLeft: self.joinSize - numJoined }); - - setTimeout(reJoin, self.joinRetryDelay); - } - - function reJoin() { - self.joinGroup(nodesJoined, onJoin); + // Attempt to retry the join after applying the delay backoff. + self.timers.setTimeout(function onTimeout() { + self.joinRetries++; + self.joinGroup(nodesJoined, onJoin); + }, self.joinDelay); } } this.joinGroup(nodesJoined, onJoin); }; -JoinCluster.prototype.joinGroup = function joinGroup(totalNodesJoined, callback) { +Joiner.prototype.joinGroup = function joinGroup(totalNodesJoined, callback) { var self = this; var group = this.selectGroup(totalNodesJoined); @@ -383,7 +375,7 @@ JoinCluster.prototype.joinGroup = function joinGroup(totalNodesJoined, callback) } }; -JoinCluster.prototype.joinNode = function joinNode(node, callback) { +Joiner.prototype.joinNode = function joinNode(node, callback) { var self = this; var joinOpts = { host: node, @@ -442,7 +434,7 @@ JoinCluster.prototype.joinNode = function joinNode(node, callback) { } }; -JoinCluster.prototype.selectGroup = function selectGroup(nodesJoined) { +Joiner.prototype.selectGroup = function selectGroup(nodesJoined) { nodesJoined = nodesJoined || []; var self = this; @@ -483,7 +475,7 @@ JoinCluster.prototype.selectGroup = function selectGroup(nodesJoined) { }; function createJoiner(opts) { - return new JoinCluster(opts); + return new Joiner(opts); } function joinCluster(opts, callback) { diff --git a/server/admin/member.js b/server/admin/member.js index 969815d1..19ac7f06 100644 --- a/server/admin/member.js +++ b/server/admin/member.js @@ -21,7 +21,7 @@ var errors = require('../../lib/errors.js'); var Member = require('../../lib/membership/member.js'); -var sendJoin = require('../../lib/gossip/join-sender.js').joinCluster; +var sendJoin = require('../../lib/gossip/joiner.js').joinCluster; var TypedError = require('error/typed'); var RedundantLeaveError = TypedError({ diff --git a/test/integration/join-test.js b/test/integration/join-test.js index d3ea3806..8838f987 100644 --- a/test/integration/join-test.js +++ b/test/integration/join-test.js @@ -152,8 +152,10 @@ testRingpopCluster({ testRingpopCluster({ size: 3, tap: function tap(cluster) { - // This'll make Node 0's join fail faster - cluster[0].config.set('maxJoinAttempts', 1); + // Setting join config will make Node 0's join fail faster + cluster[0].config.set('joinDelayMax', 0); + cluster[0].config.set('joinDelayMin', 0); + cluster[0].config.set('maxJoinDuration', 1); cluster[1].config.set('memberBlacklist', [/127.0.0.1:10000/]); } }, 'join blacklist', function t(bootRes, cluster, assert) { diff --git a/test/unit/config_test.js b/test/unit/config_test.js index 5bf5fc5f..6ef6a13d 100644 --- a/test/unit/config_test.js +++ b/test/unit/config_test.js @@ -93,9 +93,9 @@ test('validates memberBlacklist seed', function t(assert) { test('validates num', function t(assert) { var config = new Config(null, { - 'maxJoinAttempts': 'notanum' + 'TEST_KEY': 'notanum' }); - assert.equals(50, config.get('maxJoinAttempts'), + assert.equals(100, config.get('TEST_KEY'), 'uses default'); assert.end(); }); diff --git a/test/unit/join-sender-test.js b/test/unit/joiner_test.js similarity index 98% rename from test/unit/join-sender-test.js rename to test/unit/joiner_test.js index 751365fb..f605f022 100644 --- a/test/unit/join-sender-test.js +++ b/test/unit/joiner_test.js @@ -19,7 +19,7 @@ // THE SOFTWARE. 'use strict'; -var createJoiner = require('../../lib/gossip/join-sender.js').createJoiner; +var createJoiner = require('../../lib/gossip/joiner.js').createJoiner; var Ringpop = require('../../index.js'); var test = require('tape');