Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Master election should demotes nodes which try to join the cluster for the first time #7558

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 48 additions & 14 deletions src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import org.elasticsearch.discovery.zen.fd.MasterFaultDetection;
import org.elasticsearch.discovery.zen.fd.NodesFaultDetection;
import org.elasticsearch.discovery.zen.membership.MembershipAction;
import org.elasticsearch.discovery.zen.ping.PingContextProvider;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.discovery.zen.ping.ZenPingService;
import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
Expand All @@ -69,14 +70,15 @@
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import static com.google.common.collect.Lists.newArrayList;
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;

/**
*
*/
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider {
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, PingContextProvider {

public final static String SETTING_REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
public final static String SETTING_PING_TIMEOUT = "discovery.zen.ping.timeout";
Expand Down Expand Up @@ -139,6 +141,9 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen

private volatile boolean rejoinOnMasterGone;

/** counts the time this node has joined the cluster or have elected it self as master */
private final AtomicLong clusterJoinsCounter = new AtomicLong();

@Nullable
private NodeService nodeService;

Expand Down Expand Up @@ -194,7 +199,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
this.nodesFD.addListener(new NodeFaultDetectionListener());

this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings, clusterName);
this.pingService.setNodesProvider(this);
this.pingService.setPingContextProvider(this);
this.membership = new MembershipAction(settings, clusterService, transportService, this, new MembershipListener());

transportService.registerHandler(DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequestHandler());
Expand Down Expand Up @@ -290,6 +295,7 @@ public String nodeDescription() {
return clusterName.value() + "/" + localNode.id();
}

/** start of {@link org.elasticsearch.discovery.zen.ping.PingContextProvider } implementation */
@Override
public DiscoveryNodes nodes() {
DiscoveryNodes latestNodes = this.latestDiscoNodes;
Expand All @@ -305,6 +311,14 @@ public NodeService nodeService() {
return this.nodeService;
}

@Override
public boolean nodeHasJoinedClusterOnce() {
return clusterJoinsCounter.get() > 0;
}

/** end of {@link org.elasticsearch.discovery.zen.ping.PingContextProvider } implementation */


@Override
public void publish(ClusterState clusterState, AckListener ackListener) {
if (!master) {
Expand Down Expand Up @@ -387,6 +401,8 @@ public void onFailure(String source, Throwable t) {
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
sendInitialStateEventIfNeeded();
long count = clusterJoinsCounter.incrementAndGet();
logger.trace("cluster joins counter set to [{}] (elected as master)", count);
}
});
} else {
Expand All @@ -404,8 +420,8 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
}

masterFD.start(masterNode, "initial_join");
// no need to submit the received cluster state, we will get it from the master when it publishes
// the fact that we joined
long count = clusterJoinsCounter.incrementAndGet();
logger.trace("cluster joins counter set to [{}] (joined master)", count);
}
}
}
Expand Down Expand Up @@ -922,7 +938,7 @@ private DiscoveryNode findMaster() {
sb.append(" {none}");
} else {
for (ZenPing.PingResponse pingResponse : fullPingResponses) {
sb.append("\n\t--> ").append("target [").append(pingResponse.target()).append("], master [").append(pingResponse.master()).append("]");
sb.append("\n\t--> ").append(pingResponse);
}
}
logger.trace(sb.toString());
Expand All @@ -931,7 +947,7 @@ private DiscoveryNode findMaster() {
// filter responses
List<ZenPing.PingResponse> pingResponses = Lists.newArrayList();
for (ZenPing.PingResponse pingResponse : fullPingResponses) {
DiscoveryNode node = pingResponse.target();
DiscoveryNode node = pingResponse.node();
if (masterElectionFilterClientNodes && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) {
// filter out the client node, which is a client node, or also one that is not data and not master (effectively, client)
} else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) {
Expand All @@ -947,7 +963,7 @@ private DiscoveryNode findMaster() {
sb.append(" {none}");
} else {
for (ZenPing.PingResponse pingResponse : pingResponses) {
sb.append("\n\t--> ").append("target [").append(pingResponse.target()).append("], master [").append(pingResponse.master()).append("]");
sb.append("\n\t--> ").append(pingResponse);
}
}
logger.debug(sb.toString());
Expand All @@ -963,20 +979,38 @@ private DiscoveryNode findMaster() {
}
}

Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
// nodes discovered during pinging
Set<DiscoveryNode> activeNodes = Sets.newHashSet();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if these collections are used for iteration I'd use something that has a stable iteration order instead of hashSet which might be different due to object identity and rehashing etc. I butt we should force this on the interface in electMaster further down the road

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure -

  • electMaster doesn't care about order -it sorts the list anyway
  • the set dedups the results (though I don't think that's needed)
  • DiscoveryNodes implements equals and hashcode - so no object pointers are used?

Before changing I'll have to double check the dedup logic is not needed. I'm not sure it's worth the risk - or do I miss something?

// nodes discovered who has previously been part of the cluster and do not ping for the very first time
Set<DiscoveryNode> joinedOnceActiveNodes = Sets.newHashSet();
if (localNode.masterNode()) {
possibleMasterNodes.add(localNode);
activeNodes.add(localNode);
long joinsCounter = clusterJoinsCounter.get();
if (joinsCounter > 0) {
logger.trace("adding local node to the list of active nodes who has previously joined the cluster (joins counter is [{}})", joinsCounter);
joinedOnceActiveNodes.add(localNode);
}
}
for (ZenPing.PingResponse pingResponse : pingResponses) {
possibleMasterNodes.add(pingResponse.target());
activeNodes.add(pingResponse.node());
if (pingResponse.hasJoinedOnce()) {
joinedOnceActiveNodes.add(pingResponse.node());
}
}

if (pingMasters.isEmpty()) {
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
if (electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
return electMaster.electMaster(possibleMasterNodes);
if (electMaster.hasEnoughMasterNodes(activeNodes)) {
// we give preference to nodes who have previously already joined the cluster. Those will
// have a cluster state in memory, including an up to date routing table (which is not persistent to disk
// by the gateway)
DiscoveryNode master = electMaster.electMaster(joinedOnceActiveNodes);
if (master != null) {
return master;
}
return electMaster.electMaster(activeNodes);
} else {
logger.trace("not enough master nodes [{}]", possibleMasterNodes);
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
logger.trace("not enough master nodes [{}]", activeNodes);
return null;
}
} else {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.discovery.zen.ping;

import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;

/**
*
*/
public interface PingContextProvider extends DiscoveryNodesProvider {

/** return true if this node has previously joined the cluster at least once. False if this is first join */
boolean nodeHasJoinedClusterOnce();

}
51 changes: 40 additions & 11 deletions src/main/java/org/elasticsearch/discovery/zen/ping/ZenPing.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@
package org.elasticsearch.discovery.zen.ping;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.component.LifecycleComponent;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;

import java.io.IOException;

Expand All @@ -39,7 +39,7 @@
*/
public interface ZenPing extends LifecycleComponent<ZenPing> {

void setNodesProvider(DiscoveryNodesProvider nodesProvider);
void setPingContextProvider(PingContextProvider contextProvider);

void ping(PingListener listener, TimeValue timeout) throws ElasticsearchException;

Expand All @@ -49,36 +49,52 @@ public interface PingListener {
}

public static class PingResponse implements Streamable {

public static final PingResponse[] EMPTY = new PingResponse[0];

private ClusterName clusterName;

private DiscoveryNode target;
private DiscoveryNode node;

private DiscoveryNode master;

private boolean hasJoinedOnce;

private PingResponse() {
}

public PingResponse(DiscoveryNode target, DiscoveryNode master, ClusterName clusterName) {
this.target = target;
/**
* @param node the node which this ping describes
* @param master the current master of the node
* @param clusterName the cluster name of the node
* @param hasJoinedOnce true if the joined has successfully joined the cluster before
*/
public PingResponse(DiscoveryNode node, DiscoveryNode master, ClusterName clusterName, boolean hasJoinedOnce) {
this.node = node;
this.master = master;
this.clusterName = clusterName;
this.hasJoinedOnce = hasJoinedOnce;
}

public ClusterName clusterName() {
return this.clusterName;
}

public DiscoveryNode target() {
return target;
/** the node which this ping describes */
public DiscoveryNode node() {
return node;
}

/** the current master of the node */
public DiscoveryNode master() {
return master;
}

/** true if the joined has successfully joined the cluster before */
public boolean hasJoinedOnce() {
return hasJoinedOnce;
}

public static PingResponse readPingResponse(StreamInput in) throws IOException {
PingResponse response = new PingResponse();
response.readFrom(in);
Expand All @@ -88,27 +104,40 @@ public static PingResponse readPingResponse(StreamInput in) throws IOException {
@Override
public void readFrom(StreamInput in) throws IOException {
clusterName = readClusterName(in);
target = readNode(in);
node = readNode(in);
if (in.readBoolean()) {
master = readNode(in);
}
if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
this.hasJoinedOnce = in.readBoolean();
} else {
// As of 1.4.0 we prefer to elect nodes which have previously successfully joined the cluster.
// Nodes before 1.4.0 do not take this into consideration. If pre<1.4.0 node elects it self as master
// based on the pings, we need to make sure we do the same. We therefore can not demote it
// and thus mark it as if it has previously joined.
this.hasJoinedOnce = true;
}

}

@Override
public void writeTo(StreamOutput out) throws IOException {
clusterName.writeTo(out);
target.writeTo(out);
node.writeTo(out);
if (master == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
master.writeTo(out);
}
if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
out.writeBoolean(hasJoinedOnce);
}
}

@Override
public String toString() {
return "ping_response{target [" + target + "], master [" + master + "], cluster_name[" + clusterName.value() + "]}";
return "ping_response{node [" + node + "], master [" + master + "], hasJoinedOnce [" + hasJoinedOnce + "], cluster_name[" + clusterName.value() + "]}";
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.ping.multicast.MulticastZenPing;
import org.elasticsearch.discovery.zen.ping.unicast.UnicastHostsProvider;
Expand Down Expand Up @@ -92,12 +91,12 @@ public void zenPings(ImmutableList<? extends ZenPing> pings) {
}

@Override
public void setNodesProvider(DiscoveryNodesProvider nodesProvider) {
public void setPingContextProvider(PingContextProvider contextProvider) {
if (lifecycle.started()) {
throw new ElasticsearchIllegalStateException("Can't set nodes provider when started");
}
for (ZenPing zenPing : zenPings) {
zenPing.setNodesProvider(nodesProvider);
zenPing.setPingContextProvider(contextProvider);
}
}

Expand Down Expand Up @@ -172,7 +171,7 @@ private CompoundPingListener(PingListener listener, ImmutableList<? extends ZenP
public void onPing(PingResponse[] pings) {
if (pings != null) {
for (PingResponse pingResponse : pings) {
responses.put(pingResponse.target(), pingResponse);
responses.put(pingResponse.node(), pingResponse);
}
}
if (counter.decrementAndGet() == 0) {
Expand Down
Loading