diff --git a/server/src/main/java/org/opensearch/action/bulk/TransportShardBulkAction.java b/server/src/main/java/org/opensearch/action/bulk/TransportShardBulkAction.java index 56fb688290002..59f9042ec4a85 100644 --- a/server/src/main/java/org/opensearch/action/bulk/TransportShardBulkAction.java +++ b/server/src/main/java/org/opensearch/action/bulk/TransportShardBulkAction.java @@ -94,6 +94,8 @@ import java.util.function.Function; import java.util.function.LongSupplier; +import org.opensearch.action.support.replication.ReplicationMode; + /** * Performs shard-level bulk (index, delete or update) operations * @@ -193,6 +195,14 @@ protected long primaryOperationSize(BulkShardRequest request) { return request.ramBytesUsed(); } + @Override + protected ReplicationMode getReplicationMode(IndexShard indexShard) { + if (indexShard.isRemoteTranslogEnabled()) { + return ReplicationMode.PRIMARY_TERM_VALIDATION; + } + return super.getReplicationMode(indexShard); + } + public static void performOnPrimary( BulkShardRequest request, IndexShard primary, diff --git a/server/src/main/java/org/opensearch/action/support/replication/FanoutReplicationProxy.java b/server/src/main/java/org/opensearch/action/support/replication/FanoutReplicationProxy.java new file mode 100644 index 0000000000000..2980df4c1c0af --- /dev/null +++ b/server/src/main/java/org/opensearch/action/support/replication/FanoutReplicationProxy.java @@ -0,0 +1,25 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.support.replication; + +import org.opensearch.cluster.routing.ShardRouting; + +/** + * This implementation of {@link ReplicationProxy} fans out the replication request to current shard routing if + * it is not the primary and has replication mode as {@link ReplicationMode#FULL_REPLICATION}. + * + * @opensearch.internal + */ +public class FanoutReplicationProxy extends ReplicationProxy { + + @Override + ReplicationMode determineReplicationMode(ShardRouting shardRouting, ShardRouting primaryRouting) { + return shardRouting.isSameAllocation(primaryRouting) == false ? ReplicationMode.FULL_REPLICATION : ReplicationMode.NO_REPLICATION; + } +} diff --git a/server/src/main/java/org/opensearch/action/support/replication/ReplicationMode.java b/server/src/main/java/org/opensearch/action/support/replication/ReplicationMode.java new file mode 100644 index 0000000000000..f9b85cc4bd7aa --- /dev/null +++ b/server/src/main/java/org/opensearch/action/support/replication/ReplicationMode.java @@ -0,0 +1,32 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.support.replication; + +/** + * The type of replication used for inter-node replication. + * + * @opensearch.internal + */ +public enum ReplicationMode { + /** + * In this mode, a {@code TransportReplicationAction} is fanned out to underlying concerned shard and is replicated logically. + * In short, this mode would replicate the {@link ReplicationRequest} to + * the replica shard along with primary term validation. + */ + FULL_REPLICATION, + /** + * In this mode, a {@code TransportReplicationAction} is fanned out to underlying concerned shard and used for + * primary term validation only. The request is not replicated logically. + */ + PRIMARY_TERM_VALIDATION, + /** + * In this mode, a {@code TransportReplicationAction} does not fan out to the underlying concerned shard. + */ + NO_REPLICATION; +} diff --git a/server/src/main/java/org/opensearch/action/support/replication/ReplicationModeAwareProxy.java b/server/src/main/java/org/opensearch/action/support/replication/ReplicationModeAwareProxy.java new file mode 100644 index 0000000000000..fa28e99d5696f --- /dev/null +++ b/server/src/main/java/org/opensearch/action/support/replication/ReplicationModeAwareProxy.java @@ -0,0 +1,44 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.support.replication; + +import org.opensearch.cluster.routing.ShardRouting; + +import java.util.Objects; + +/** + * This implementation of {@link ReplicationProxy} fans out the replication request to current shard routing basis + * the shard routing's replication mode and replication override policy. + * + * @opensearch.internal + */ +public class ReplicationModeAwareProxy extends ReplicationProxy { + + private final ReplicationMode replicationModeOverride; + + public ReplicationModeAwareProxy(ReplicationMode replicationModeOverride) { + assert Objects.nonNull(replicationModeOverride); + this.replicationModeOverride = replicationModeOverride; + } + + @Override + ReplicationMode determineReplicationMode(ShardRouting shardRouting, ShardRouting primaryRouting) { + + // If the current routing is the primary, then it does not need to be replicated + if (shardRouting.isSameAllocation(primaryRouting)) { + return ReplicationMode.NO_REPLICATION; + } + + if (primaryRouting.relocating() && shardRouting.isSameAllocation(primaryRouting.getTargetRelocatingShard())) { + return ReplicationMode.FULL_REPLICATION; + } + + return replicationModeOverride; + } +} diff --git a/server/src/main/java/org/opensearch/action/support/replication/ReplicationOperation.java b/server/src/main/java/org/opensearch/action/support/replication/ReplicationOperation.java index da37eee88a4e0..1a6a5a9245eb2 100644 --- a/server/src/main/java/org/opensearch/action/support/replication/ReplicationOperation.java +++ b/server/src/main/java/org/opensearch/action/support/replication/ReplicationOperation.java @@ -35,13 +35,14 @@ import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.store.AlreadyClosedException; import org.opensearch.Assertions; -import org.opensearch.OpenSearchException; import org.opensearch.ExceptionsHelper; +import org.opensearch.OpenSearchException; import org.opensearch.action.ActionListener; import org.opensearch.action.UnavailableShardsException; import org.opensearch.action.support.ActiveShardCount; import org.opensearch.action.support.RetryableAction; import org.opensearch.action.support.TransportActions; +import org.opensearch.action.support.replication.ReplicationProxyRequest.Builder; import org.opensearch.cluster.action.shard.ShardStateAction; import org.opensearch.cluster.routing.IndexShardRoutingTable; import org.opensearch.cluster.routing.ShardRouting; @@ -99,6 +100,7 @@ public class ReplicationOperation< private final TimeValue initialRetryBackoffBound; private final TimeValue retryTimeout; private final long primaryTerm; + private final ReplicationProxy replicationProxy; // exposed for tests private final ActionListener resultListener; @@ -117,7 +119,8 @@ public ReplicationOperation( String opType, long primaryTerm, TimeValue initialRetryBackoffBound, - TimeValue retryTimeout + TimeValue retryTimeout, + ReplicationProxy replicationProxy ) { this.replicasProxy = replicas; this.primary = primary; @@ -129,6 +132,7 @@ public ReplicationOperation( this.primaryTerm = primaryTerm; this.initialRetryBackoffBound = initialRetryBackoffBound; this.retryTimeout = retryTimeout; + this.replicationProxy = replicationProxy; } public void execute() throws Exception { @@ -226,20 +230,26 @@ private void performOnReplicas( final ShardRouting primaryRouting = primary.routingEntry(); - for (final ShardRouting shard : replicationGroup.getReplicationTargets()) { - if (shard.isSameAllocation(primaryRouting) == false) { - performOnReplica(shard, replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, pendingReplicationActions); - } + for (final ShardRouting shardRouting : replicationGroup.getReplicationTargets()) { + ReplicationProxyRequest proxyRequest = new Builder( + shardRouting, + primaryRouting, + globalCheckpoint, + maxSeqNoOfUpdatesOrDeletes, + pendingReplicationActions, + replicaRequest + ).build(); + replicationProxy.performOnReplicaProxy(proxyRequest, this::performOnReplica); } } - private void performOnReplica( - final ShardRouting shard, - final ReplicaRequest replicaRequest, - final long globalCheckpoint, - final long maxSeqNoOfUpdatesOrDeletes, - final PendingReplicationActions pendingReplicationActions - ) { + private void performOnReplica(final ReplicationProxyRequest replicationProxyRequest) { + final ShardRouting shard = replicationProxyRequest.getShardRouting(); + final ReplicaRequest replicaRequest = replicationProxyRequest.getReplicaRequest(); + final long globalCheckpoint = replicationProxyRequest.getGlobalCheckpoint(); + final long maxSeqNoOfUpdatesOrDeletes = replicationProxyRequest.getMaxSeqNoOfUpdatesOrDeletes(); + final PendingReplicationActions pendingReplicationActions = replicationProxyRequest.getPendingReplicationActions(); + if (logger.isTraceEnabled()) { logger.trace("[{}] sending op [{}] to replica {} for request [{}]", shard.shardId(), opType, shard, replicaRequest); } diff --git a/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxy.java b/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxy.java new file mode 100644 index 0000000000000..e098ea1aed960 --- /dev/null +++ b/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxy.java @@ -0,0 +1,51 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.support.replication; + +import org.opensearch.cluster.routing.ShardRouting; + +import java.util.function.Consumer; + +/** + * Used for performing any replication operation on replicas. Depending on the implementation, the replication call + * can fanout or stops here. + * + * @opensearch.internal + */ +public abstract class ReplicationProxy { + + /** + * Depending on the actual implementation and the passed {@link ReplicationMode}, the replication + * mode is determined using which the replication request is performed on the replica or not. + * + * @param proxyRequest replication proxy request + * @param originalPerformOnReplicaConsumer original performOnReplica method passed as consumer + */ + public void performOnReplicaProxy( + ReplicationProxyRequest proxyRequest, + Consumer> originalPerformOnReplicaConsumer + ) { + ReplicationMode replicationMode = determineReplicationMode(proxyRequest.getShardRouting(), proxyRequest.getPrimaryRouting()); + // If the replication modes are 1. Logical replication or 2. Primary term validation, we let the call get performed on the + // replica shard. + if (replicationMode == ReplicationMode.FULL_REPLICATION || replicationMode == ReplicationMode.PRIMARY_TERM_VALIDATION) { + originalPerformOnReplicaConsumer.accept(proxyRequest); + } + } + + /** + * Determines what is the replication mode basis the constructor arguments of the implementation and the current + * replication mode aware shard routing. + * + * @param shardRouting replication mode aware ShardRouting + * @param primaryRouting primary ShardRouting + * @return the determined replication mode. + */ + abstract ReplicationMode determineReplicationMode(final ShardRouting shardRouting, final ShardRouting primaryRouting); +} diff --git a/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxyFactory.java b/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxyFactory.java new file mode 100644 index 0000000000000..a2bbf58fb9100 --- /dev/null +++ b/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxyFactory.java @@ -0,0 +1,29 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.support.replication; + +import org.opensearch.index.shard.IndexShard; + +/** + * Factory that returns the {@link ReplicationProxy} instance basis the {@link ReplicationMode}. + * + * @opensearch.internal + */ +public class ReplicationProxyFactory { + + public static ReplicationProxy create( + final IndexShard indexShard, + final ReplicationMode replicationModeOverride + ) { + if (indexShard.isRemoteTranslogEnabled()) { + return new ReplicationModeAwareProxy<>(replicationModeOverride); + } + return new FanoutReplicationProxy<>(); + } +} diff --git a/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxyRequest.java b/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxyRequest.java new file mode 100644 index 0000000000000..180efd6f423c3 --- /dev/null +++ b/server/src/main/java/org/opensearch/action/support/replication/ReplicationProxyRequest.java @@ -0,0 +1,116 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.action.support.replication; + +import org.opensearch.cluster.routing.ShardRouting; + +import java.util.Objects; + +/** + * This is proxy wrapper over the replication request whose object can be created using the Builder present inside. + * + * @opensearch.internal + */ +public class ReplicationProxyRequest { + + private final ShardRouting shardRouting; + + private final ShardRouting primaryRouting; + + private final long globalCheckpoint; + + private final long maxSeqNoOfUpdatesOrDeletes; + + private final PendingReplicationActions pendingReplicationActions; + + private final ReplicaRequest replicaRequest; + + private ReplicationProxyRequest( + ShardRouting shardRouting, + ShardRouting primaryRouting, + long globalCheckpoint, + long maxSeqNoOfUpdatesOrDeletes, + PendingReplicationActions pendingReplicationActions, + ReplicaRequest replicaRequest + ) { + this.shardRouting = Objects.requireNonNull(shardRouting); + this.primaryRouting = Objects.requireNonNull(primaryRouting); + this.globalCheckpoint = globalCheckpoint; + this.maxSeqNoOfUpdatesOrDeletes = maxSeqNoOfUpdatesOrDeletes; + this.pendingReplicationActions = Objects.requireNonNull(pendingReplicationActions); + this.replicaRequest = Objects.requireNonNull(replicaRequest); + } + + public ShardRouting getShardRouting() { + return shardRouting; + } + + public ShardRouting getPrimaryRouting() { + return primaryRouting; + } + + public long getGlobalCheckpoint() { + return globalCheckpoint; + } + + public long getMaxSeqNoOfUpdatesOrDeletes() { + return maxSeqNoOfUpdatesOrDeletes; + } + + public PendingReplicationActions getPendingReplicationActions() { + return pendingReplicationActions; + } + + public ReplicaRequest getReplicaRequest() { + return replicaRequest; + } + + /** + * Builder of ReplicationProxyRequest. + * + * @opensearch.internal + */ + public static class Builder { + + private final ShardRouting shardRouting; + private final ShardRouting primaryRouting; + private final long globalCheckpoint; + private final long maxSeqNoOfUpdatesOrDeletes; + private final PendingReplicationActions pendingReplicationActions; + private final ReplicaRequest replicaRequest; + + public Builder( + ShardRouting shardRouting, + ShardRouting primaryRouting, + long globalCheckpoint, + long maxSeqNoOfUpdatesOrDeletes, + PendingReplicationActions pendingReplicationActions, + ReplicaRequest replicaRequest + ) { + this.shardRouting = shardRouting; + this.primaryRouting = primaryRouting; + this.globalCheckpoint = globalCheckpoint; + this.maxSeqNoOfUpdatesOrDeletes = maxSeqNoOfUpdatesOrDeletes; + this.pendingReplicationActions = pendingReplicationActions; + this.replicaRequest = replicaRequest; + } + + public ReplicationProxyRequest build() { + return new ReplicationProxyRequest<>( + shardRouting, + primaryRouting, + globalCheckpoint, + maxSeqNoOfUpdatesOrDeletes, + pendingReplicationActions, + replicaRequest + ); + } + + } +} diff --git a/server/src/main/java/org/opensearch/action/support/replication/TransportReplicationAction.java b/server/src/main/java/org/opensearch/action/support/replication/TransportReplicationAction.java index 9d3ee8e49e8c2..0a0904a1b3aaa 100644 --- a/server/src/main/java/org/opensearch/action/support/replication/TransportReplicationAction.java +++ b/server/src/main/java/org/opensearch/action/support/replication/TransportReplicationAction.java @@ -258,6 +258,19 @@ protected ReplicationOperation.Replicas newReplicasProxy() { return new ReplicasProxy(); } + /** + * This method is used for defining the {@link ReplicationMode} override per {@link TransportReplicationAction}. + * + * @param indexShard index shard used to determining the policy. + * @return the overridden replication mode. + */ + protected ReplicationMode getReplicationMode(IndexShard indexShard) { + if (indexShard.isRemoteTranslogEnabled()) { + return ReplicationMode.NO_REPLICATION; + } + return ReplicationMode.FULL_REPLICATION; + } + protected abstract Response newResponseInstance(StreamInput in) throws IOException; /** @@ -533,7 +546,11 @@ public void handleException(TransportException exp) { actionName, primaryRequest.getPrimaryTerm(), initialRetryBackoffBound, - retryTimeout + retryTimeout, + ReplicationProxyFactory.create( + primaryShardReference.indexShard, + getReplicationMode(primaryShardReference.indexShard) + ) ).execute(); } } catch (Exception e) { diff --git a/server/src/main/java/org/opensearch/index/seqno/ReplicationTracker.java b/server/src/main/java/org/opensearch/index/seqno/ReplicationTracker.java index 701dec069d946..a40048e7b9781 100644 --- a/server/src/main/java/org/opensearch/index/seqno/ReplicationTracker.java +++ b/server/src/main/java/org/opensearch/index/seqno/ReplicationTracker.java @@ -622,7 +622,9 @@ public synchronized void renewPeerRecoveryRetentionLeases() { * If this shard copy is tracked then we got here here via a rolling upgrade from an older version that doesn't * create peer recovery retention leases for every shard copy. */ - assert checkpoints.get(shardRouting.allocationId().getId()).tracked == false + assert (checkpoints.get(shardRouting.allocationId().getId()).tracked + && checkpoints.get(shardRouting.allocationId().getId()).replicated == false) + || checkpoints.get(shardRouting.allocationId().getId()).tracked == false || hasAllPeerRecoveryRetentionLeases == false; return false; } @@ -680,20 +682,29 @@ public static class CheckpointState implements Writeable { */ long globalCheckpoint; /** - * whether this shard is treated as in-sync and thus contributes to the global checkpoint calculation + * When a shard is in-sync, it is capable of being promoted as the primary during a failover. An in-sync shard + * contributes to global checkpoint calculation on the primary iff {@link CheckpointState#replicated} is true. */ boolean inSync; /** - * whether this shard is tracked in the replication group, i.e., should receive document updates from the primary. + * whether this shard is tracked in the replication group and has localTranslog, i.e., should receive document updates + * from the primary. Tracked shards with localTranslog would have corresponding retention leases on the primary shard's + * {@link ReplicationTracker}. */ boolean tracked; - public CheckpointState(long localCheckpoint, long globalCheckpoint, boolean inSync, boolean tracked) { + /** + * Whether the replication requests to the primary are replicated to the concerned shard or not. + */ + boolean replicated; + + public CheckpointState(long localCheckpoint, long globalCheckpoint, boolean inSync, boolean tracked, boolean replicated) { this.localCheckpoint = localCheckpoint; this.globalCheckpoint = globalCheckpoint; this.inSync = inSync; this.tracked = tracked; + this.replicated = replicated; } public CheckpointState(StreamInput in) throws IOException { @@ -701,6 +712,11 @@ public CheckpointState(StreamInput in) throws IOException { this.globalCheckpoint = in.readZLong(); this.inSync = in.readBoolean(); this.tracked = in.readBoolean(); + if (in.getVersion().onOrAfter(Version.CURRENT)) { + this.replicated = in.readBoolean(); + } else { + this.replicated = true; + } } @Override @@ -709,13 +725,14 @@ public void writeTo(StreamOutput out) throws IOException { out.writeZLong(globalCheckpoint); out.writeBoolean(inSync); out.writeBoolean(tracked); + out.writeBoolean(replicated); } /** * Returns a full copy of this object */ public CheckpointState copy() { - return new CheckpointState(localCheckpoint, globalCheckpoint, inSync, tracked); + return new CheckpointState(localCheckpoint, globalCheckpoint, inSync, tracked, replicated); } public long getLocalCheckpoint() { @@ -737,6 +754,8 @@ public String toString() { + inSync + ", tracked=" + tracked + + ", replicated=" + + replicated + '}'; } @@ -750,7 +769,8 @@ public boolean equals(Object o) { if (localCheckpoint != that.localCheckpoint) return false; if (globalCheckpoint != that.globalCheckpoint) return false; if (inSync != that.inSync) return false; - return tracked == that.tracked; + if (tracked != that.tracked) return false; + return replicated == that.replicated; } @Override @@ -759,6 +779,7 @@ public int hashCode() { result = 31 * result + Long.hashCode(globalCheckpoint); result = 31 * result + Boolean.hashCode(inSync); result = 31 * result + Boolean.hashCode(tracked); + result = 31 * result + Boolean.hashCode(replicated); return result; } } @@ -774,7 +795,7 @@ public synchronized ObjectLongMap getInSyncGlobalCheckpoints() { final ObjectLongMap globalCheckpoints = new ObjectLongHashMap<>(checkpoints.size()); // upper bound on the size checkpoints.entrySet() .stream() - .filter(e -> e.getValue().inSync) + .filter(e -> e.getValue().inSync && e.getValue().replicated) .forEach(e -> globalCheckpoints.put(e.getKey(), e.getValue().globalCheckpoint)); return globalCheckpoints; } @@ -833,6 +854,9 @@ private boolean invariant() { // the current shard is marked as in-sync when the global checkpoint tracker operates in primary mode assert !primaryMode || checkpoints.get(shardAllocationId).inSync; + // the current shard is marked as tracked when the global checkpoint tracker operates in primary mode + assert !primaryMode || checkpoints.get(shardAllocationId).tracked; + // the routing table and replication group is set when the global checkpoint tracker operates in primary mode assert !primaryMode || (routingTable != null && replicationGroup != null) : "primary mode but routing table is " + routingTable @@ -902,7 +926,8 @@ private boolean invariant() { if (primaryMode && indexSettings.isSoftDeleteEnabled() && hasAllPeerRecoveryRetentionLeases) { // all tracked shard copies have a corresponding peer-recovery retention lease for (final ShardRouting shardRouting : routingTable.assignedShards()) { - if (checkpoints.get(shardRouting.allocationId().getId()).tracked && !indexSettings().isRemoteTranslogStoreEnabled()) { + final CheckpointState cps = checkpoints.get(shardRouting.allocationId().getId()); + if (cps.tracked && cps.replicated) { assert retentionLeases.contains(getPeerRecoveryRetentionLeaseId(shardRouting)) : "no retention lease for tracked shard [" + shardRouting + "] in " + retentionLeases; assert PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals( @@ -926,7 +951,11 @@ private static long inSyncCheckpointStates( Function reducer ) { final OptionalLong value = reducer.apply( - checkpoints.values().stream().filter(cps -> cps.inSync).mapToLong(function).filter(v -> v != SequenceNumbers.UNASSIGNED_SEQ_NO) + checkpoints.values() + .stream() + .filter(cps -> cps.inSync && cps.replicated) + .mapToLong(function) + .filter(v -> v != SequenceNumbers.UNASSIGNED_SEQ_NO) ); return value.isPresent() ? value.getAsLong() : SequenceNumbers.UNASSIGNED_SEQ_NO; } @@ -1028,6 +1057,11 @@ private ReplicationGroup calculateReplicationGroup() { } else { newVersion = replicationGroup.getVersion() + 1; } + + assert indexSettings().isRemoteTranslogStoreEnabled() + || checkpoints.entrySet().stream().filter(e -> e.getValue().tracked).allMatch(e -> e.getValue().replicated) + : "In absence of remote translog store, all tracked shards must have replication mode as LOGICAL_REPLICATION"; + return new ReplicationGroup( routingTable, checkpoints.entrySet().stream().filter(e -> e.getValue().inSync).map(Map.Entry::getKey).collect(Collectors.toSet()), @@ -1122,10 +1156,11 @@ public synchronized void activatePrimaryMode(final long localCheckpoint) { } /** - * Creates a peer recovery retention lease for this shard, if one does not already exist and this shard is the sole shard copy in the - * replication group. If one does not already exist and yet there are other shard copies in this group then we must have just done - * a rolling upgrade from a version before {@code LegacyESVersion#V_7_4_0}, in which case the missing leases should be created - * asynchronously by the caller using {@link ReplicationTracker#createMissingPeerRecoveryRetentionLeases(ActionListener)}. + * Creates a peer recovery retention lease for this shard, if one does not already exist and this shard is the sole + * shard copy with local translog in the replication group. If one does not already exist and yet there are other + * shard copies in this group then we must have just done a rolling upgrade from a version before {@code LegacyESVersion#V_7_4_0}, + * in which case the missing leases should be created asynchronously by the caller using + * {@link ReplicationTracker#createMissingPeerRecoveryRetentionLeases(ActionListener)}. */ private void addPeerRecoveryRetentionLeaseForSolePrimary() { assert primaryMode; @@ -1134,7 +1169,8 @@ private void addPeerRecoveryRetentionLeaseForSolePrimary() { final ShardRouting primaryShard = routingTable.primaryShard(); final String leaseId = getPeerRecoveryRetentionLeaseId(primaryShard); if (retentionLeases.get(leaseId) == null) { - if (replicationGroup.getReplicationTargets().equals(Collections.singletonList(primaryShard))) { + if (replicationGroup.getReplicationTargets().equals(Collections.singletonList(primaryShard)) + || indexSettings().isRemoteTranslogStoreEnabled()) { assert primaryShard.allocationId().getId().equals(shardAllocationId) : routingTable.assignedShards() + " vs " + shardAllocationId; @@ -1197,6 +1233,12 @@ public synchronized void updateFromClusterManager( boolean removedEntries = checkpoints.keySet() .removeIf(aid -> !inSyncAllocationIds.contains(aid) && !initializingAllocationIds.contains(aid)); + final ShardRouting primary = routingTable.primaryShard(); + final String primaryAllocationId = primary.allocationId().getId(); + final String primaryTargetAllocationId = primary.relocating() + ? primary.getTargetRelocatingShard().allocationId().getId() + : null; + if (primaryMode) { // add new initializingIds that are missing locally. These are fresh shard copies - and not in-sync for (String initializingId : initializingAllocationIds) { @@ -1207,7 +1249,16 @@ public synchronized void updateFromClusterManager( + " as in-sync but it does not exist locally"; final long localCheckpoint = SequenceNumbers.UNASSIGNED_SEQ_NO; final long globalCheckpoint = localCheckpoint; - checkpoints.put(initializingId, new CheckpointState(localCheckpoint, globalCheckpoint, inSync, inSync)); + checkpoints.put( + initializingId, + new CheckpointState( + localCheckpoint, + globalCheckpoint, + inSync, + inSync, + isReplicated(initializingId, primaryAllocationId, primaryTargetAllocationId) + ) + ); } } if (removedEntries) { @@ -1217,12 +1268,30 @@ public synchronized void updateFromClusterManager( for (String initializingId : initializingAllocationIds) { final long localCheckpoint = SequenceNumbers.UNASSIGNED_SEQ_NO; final long globalCheckpoint = localCheckpoint; - checkpoints.put(initializingId, new CheckpointState(localCheckpoint, globalCheckpoint, false, false)); + checkpoints.put( + initializingId, + new CheckpointState( + localCheckpoint, + globalCheckpoint, + false, + false, + isReplicated(initializingId, primaryAllocationId, primaryTargetAllocationId) + ) + ); } for (String inSyncId : inSyncAllocationIds) { final long localCheckpoint = SequenceNumbers.UNASSIGNED_SEQ_NO; final long globalCheckpoint = localCheckpoint; - checkpoints.put(inSyncId, new CheckpointState(localCheckpoint, globalCheckpoint, true, true)); + checkpoints.put( + inSyncId, + new CheckpointState( + localCheckpoint, + globalCheckpoint, + true, + true, + isReplicated(inSyncId, primaryAllocationId, primaryTargetAllocationId) + ) + ); } } appliedClusterStateVersion = applyingClusterStateVersion; @@ -1237,6 +1306,26 @@ public synchronized void updateFromClusterManager( assert invariant(); } + /** + * Returns whether the requests are replicated considering the remote translog existence, current/primary/primary target allocation ids. + * + * @param allocationId given allocation id + * @param primaryAllocationId primary allocation id + * @param primaryTargetAllocationId primary target allocation id + * @return the replication mode. + */ + private boolean isReplicated(String allocationId, String primaryAllocationId, String primaryTargetAllocationId) { + // If remote translog is enabled, then returns replication mode checking current allocation id against the + // primary and primary target allocation id. + // If remote translog is enabled, then returns true if given allocation id matches the primary or it's relocation target allocation + // id. + if (indexSettings().isRemoteTranslogStoreEnabled()) { + return (allocationId.equals(primaryAllocationId) || allocationId.equals(primaryTargetAllocationId)); + } + // For other case which is local translog, return true as the requests are replicated to all shards in the replication group. + return true; + } + /** * Notifies the tracker of the current allocation IDs in the cluster state. * @param applyingClusterStateVersion the cluster state version being applied when updating the allocation IDs from the cluster-manager @@ -1298,13 +1387,14 @@ public synchronized void markAllocationIdAsInSync(final String allocationId, fin updateLocalCheckpoint(allocationId, cps, localCheckpoint); // if it was already in-sync (because of a previously failed recovery attempt), global checkpoint must have been // stuck from advancing - assert !cps.inSync || (cps.localCheckpoint >= getGlobalCheckpoint()) : "shard copy " + assert !cps.inSync || cps.localCheckpoint >= getGlobalCheckpoint() || cps.replicated == false : "shard copy " + allocationId + " that's already in-sync should have a local checkpoint " + cps.localCheckpoint + " that's above the global checkpoint " - + getGlobalCheckpoint(); - if (cps.localCheckpoint < getGlobalCheckpoint()) { + + getGlobalCheckpoint() + + " or it's not replicated"; + if (cps.replicated && cps.localCheckpoint < getGlobalCheckpoint()) { pendingInSync.add(allocationId); try { while (true) { @@ -1375,7 +1465,7 @@ public synchronized void updateLocalCheckpoint(final String allocationId, final logger.trace("marked [{}] as in-sync", allocationId); notifyAllWaiters(); } - if (increasedLocalCheckpoint && pending == false) { + if (cps.replicated && increasedLocalCheckpoint && pending == false) { updateGlobalCheckpointOnPrimary(); } assert invariant(); @@ -1395,7 +1485,7 @@ private static long computeGlobalCheckpoint( return fallback; } for (final CheckpointState cps : localCheckpoints) { - if (cps.inSync) { + if (cps.inSync && cps.replicated) { if (cps.localCheckpoint == SequenceNumbers.UNASSIGNED_SEQ_NO) { // unassigned in-sync replica return fallback; diff --git a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java index cc51082639cdb..d2fc354cf9298 100644 --- a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java +++ b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java @@ -40,6 +40,8 @@ import java.io.IOException; import java.util.Objects; +import org.opensearch.action.support.replication.ReplicationMode; + /** * Replication action responsible for publishing checkpoint to a replica shard. * @@ -93,6 +95,14 @@ protected void doExecute(Task task, PublishCheckpointRequest request, ActionList assert false : "use PublishCheckpointAction#publish"; } + @Override + protected ReplicationMode getReplicationMode(IndexShard indexShard) { + if (indexShard.isRemoteTranslogEnabled()) { + return ReplicationMode.FULL_REPLICATION; + } + return super.getReplicationMode(indexShard); + } + /** * Publish checkpoint request to shard */ diff --git a/server/src/test/java/org/opensearch/action/admin/indices/close/TransportVerifyShardBeforeCloseActionTests.java b/server/src/test/java/org/opensearch/action/admin/indices/close/TransportVerifyShardBeforeCloseActionTests.java index 72c7b5168fe15..a7ffde04314c3 100644 --- a/server/src/test/java/org/opensearch/action/admin/indices/close/TransportVerifyShardBeforeCloseActionTests.java +++ b/server/src/test/java/org/opensearch/action/admin/indices/close/TransportVerifyShardBeforeCloseActionTests.java @@ -32,10 +32,16 @@ package org.opensearch.action.admin.indices.close; import org.apache.lucene.util.SetOnce; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.mockito.ArgumentCaptor; import org.opensearch.action.ActionListener; import org.opensearch.action.admin.indices.flush.FlushRequest; import org.opensearch.action.support.ActionFilters; import org.opensearch.action.support.PlainActionFuture; +import org.opensearch.action.support.replication.FanoutReplicationProxy; import org.opensearch.action.support.replication.PendingReplicationActions; import org.opensearch.action.support.replication.ReplicationOperation; import org.opensearch.action.support.replication.ReplicationResponse; @@ -65,11 +71,6 @@ import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportResponse; import org.opensearch.transport.TransportService; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.BeforeClass; -import org.mockito.ArgumentCaptor; import java.util.Collections; import java.util.List; @@ -77,21 +78,21 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; -import static org.mockito.Mockito.doNothing; -import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; -import static org.opensearch.test.ClusterServiceUtils.createClusterService; -import static org.opensearch.test.ClusterServiceUtils.setState; import static org.hamcrest.Matchers.arrayWithSize; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.is; import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; +import static org.opensearch.test.ClusterServiceUtils.createClusterService; +import static org.opensearch.test.ClusterServiceUtils.setState; public class TransportVerifyShardBeforeCloseActionTests extends OpenSearchTestCase { @@ -290,7 +291,8 @@ public void testUnavailableShardsMarkedAsStale() throws Exception { "test", primaryTerm, TimeValue.timeValueMillis(20), - TimeValue.timeValueSeconds(60) + TimeValue.timeValueSeconds(60), + new FanoutReplicationProxy<>() ); operation.execute(); diff --git a/server/src/test/java/org/opensearch/action/resync/TransportResyncReplicationActionTests.java b/server/src/test/java/org/opensearch/action/resync/TransportResyncReplicationActionTests.java index 2ebca16519258..acf46e2a63333 100644 --- a/server/src/test/java/org/opensearch/action/resync/TransportResyncReplicationActionTests.java +++ b/server/src/test/java/org/opensearch/action/resync/TransportResyncReplicationActionTests.java @@ -31,10 +31,13 @@ package org.opensearch.action.resync; +import org.junit.AfterClass; +import org.junit.BeforeClass; import org.opensearch.Version; import org.opensearch.action.ActionListener; import org.opensearch.action.support.ActionFilters; import org.opensearch.action.support.PlainActionFuture; +import org.opensearch.action.support.replication.PendingReplicationActions; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.action.shard.ShardStateAction; import org.opensearch.cluster.block.ClusterBlocks; @@ -66,30 +69,29 @@ import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.nio.MockNioTransport; -import org.junit.AfterClass; -import org.junit.BeforeClass; import java.nio.charset.Charset; import java.util.Collections; import java.util.HashSet; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; -import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; -import static org.opensearch.test.ClusterServiceUtils.createClusterService; -import static org.opensearch.test.ClusterServiceUtils.setState; -import static org.opensearch.transport.TransportService.NOOP_TRANSPORT_INTERCEPTOR; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.nullValue; import static org.mockito.Mockito.any; import static org.mockito.Mockito.anyString; -import static org.mockito.Mockito.eq; import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; +import static org.opensearch.test.ClusterServiceUtils.createClusterService; +import static org.opensearch.test.ClusterServiceUtils.setState; +import static org.opensearch.transport.TransportService.NOOP_TRANSPORT_INTERCEPTOR; public class TransportResyncReplicationActionTests extends OpenSearchTestCase { @@ -156,23 +158,26 @@ public void testResyncDoesNotBlockOnPrimaryAction() throws Exception { final AtomicInteger acquiredPermits = new AtomicInteger(); final IndexShard indexShard = mock(IndexShard.class); + final PendingReplicationActions replicationActions = new PendingReplicationActions(shardId, threadPool); when(indexShard.indexSettings()).thenReturn(new IndexSettings(indexMetadata, Settings.EMPTY)); when(indexShard.shardId()).thenReturn(shardId); when(indexShard.routingEntry()).thenReturn(primaryShardRouting); when(indexShard.getPendingPrimaryTerm()).thenReturn(primaryTerm); when(indexShard.getOperationPrimaryTerm()).thenReturn(primaryTerm); when(indexShard.getActiveOperationsCount()).then(i -> acquiredPermits.get()); + when(indexShard.getPendingReplicationActions()).thenReturn(replicationActions); doAnswer(invocation -> { ActionListener callback = (ActionListener) invocation.getArguments()[0]; acquiredPermits.incrementAndGet(); callback.onResponse(acquiredPermits::decrementAndGet); return null; }).when(indexShard).acquirePrimaryOperationPermit(any(ActionListener.class), anyString(), any(), eq(true)); + Set trackedAllocationIds = shardRoutingTable.getAllAllocationIds(); when(indexShard.getReplicationGroup()).thenReturn( new ReplicationGroup( shardRoutingTable, clusterService.state().metadata().index(index).inSyncAllocationIds(shardId.id()), - shardRoutingTable.getAllAllocationIds(), + trackedAllocationIds, 0 ) ); diff --git a/server/src/test/java/org/opensearch/action/support/replication/ReplicationOperationTests.java b/server/src/test/java/org/opensearch/action/support/replication/ReplicationOperationTests.java index 8a4cdfc953bf8..3a689e356bbdf 100644 --- a/server/src/test/java/org/opensearch/action/support/replication/ReplicationOperationTests.java +++ b/server/src/test/java/org/opensearch/action/support/replication/ReplicationOperationTests.java @@ -45,6 +45,7 @@ import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; +import org.opensearch.cluster.routing.AllocationId; import org.opensearch.cluster.routing.IndexShardRoutingTable; import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.cluster.routing.ShardRoutingState; @@ -80,14 +81,18 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; -import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; -import static org.opensearch.action.support.replication.ClusterStateCreationUtils.stateWithActivePrimary; import static org.hamcrest.Matchers.arrayWithSize; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.nullValue; +import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; +import static org.opensearch.action.support.replication.ClusterStateCreationUtils.stateWithActivePrimary; +import static org.opensearch.action.support.replication.ReplicationOperation.RetryOnPrimaryException; +import static org.opensearch.cluster.routing.TestShardRouting.newShardRouting; public class ReplicationOperationTests extends OpenSearchTestCase { @@ -157,7 +162,14 @@ public void testReplication() throws Exception { final TestReplicaProxy replicasProxy = new TestReplicaProxy(simulatedFailures); final TestPrimary primary = new TestPrimary(primaryShard, () -> replicationGroup, threadPool); - final TestReplicationOperation op = new TestReplicationOperation(request, primary, listener, replicasProxy, primaryTerm); + final TestReplicationOperation op = new TestReplicationOperation( + request, + primary, + listener, + replicasProxy, + primaryTerm, + new FanoutReplicationProxy<>() + ); op.execute(); assertThat("request was not processed on primary", request.processedOnPrimary.get(), equalTo(true)); assertThat(request.processedOnReplicas, equalTo(expectedReplicas)); @@ -179,6 +191,199 @@ public void testReplication() throws Exception { assertThat(primary.knownGlobalCheckpoints, equalTo(replicasProxy.generatedGlobalCheckpoints)); } + public void testReplicationWithRemoteTranslogEnabled() throws Exception { + Set initializingIds = new HashSet<>(); + IntStream.range(0, randomIntBetween(2, 5)).forEach(x -> initializingIds.add(AllocationId.newInitializing())); + Set activeIds = new HashSet<>(); + IntStream.range(0, randomIntBetween(2, 5)).forEach(x -> activeIds.add(AllocationId.newInitializing())); + + AllocationId primaryId = activeIds.iterator().next(); + + ShardId shardId = new ShardId("test", "_na_", 0); + IndexShardRoutingTable.Builder builder = new IndexShardRoutingTable.Builder(shardId); + final ShardRouting primaryShard = newShardRouting( + shardId, + nodeIdFromAllocationId(primaryId), + null, + true, + ShardRoutingState.STARTED, + primaryId + ); + initializingIds.forEach( + aId -> builder.addShard(newShardRouting(shardId, nodeIdFromAllocationId(aId), null, false, ShardRoutingState.INITIALIZING, aId)) + ); + activeIds.stream() + .filter(aId -> !aId.equals(primaryId)) + .forEach( + aId -> builder.addShard(newShardRouting(shardId, nodeIdFromAllocationId(aId), null, false, ShardRoutingState.STARTED, aId)) + ); + builder.addShard(primaryShard); + IndexShardRoutingTable routingTable = builder.build(); + + Set inSyncAllocationIds = activeIds.stream().map(AllocationId::getId).collect(Collectors.toSet()); + ReplicationGroup replicationGroup = new ReplicationGroup(routingTable, inSyncAllocationIds, inSyncAllocationIds, 0); + List replicationTargets = replicationGroup.getReplicationTargets(); + assertEquals(inSyncAllocationIds.size(), replicationTargets.size()); + assertTrue( + replicationTargets.stream().map(sh -> sh.allocationId().getId()).collect(Collectors.toSet()).containsAll(inSyncAllocationIds) + ); + + Request request = new Request(shardId); + PlainActionFuture listener = new PlainActionFuture<>(); + Map simulatedFailures = new HashMap<>(); + TestReplicaProxy replicasProxy = new TestReplicaProxy(simulatedFailures); + TestPrimary primary = new TestPrimary(primaryShard, () -> replicationGroup, threadPool); + final TestReplicationOperation op = new TestReplicationOperation( + request, + primary, + listener, + replicasProxy, + 0, + new ReplicationModeAwareProxy<>(ReplicationMode.NO_REPLICATION) + ); + op.execute(); + assertTrue("request was not processed on primary", request.processedOnPrimary.get()); + assertEquals(0, request.processedOnReplicas.size()); + assertEquals(0, replicasProxy.failedReplicas.size()); + assertEquals(0, replicasProxy.markedAsStaleCopies.size()); + assertTrue("post replication operations not run on primary", request.runPostReplicationActionsOnPrimary.get()); + assertTrue("listener is not marked as done", listener.isDone()); + + ShardInfo shardInfo = listener.actionGet().getShardInfo(); + assertEquals(1 + initializingIds.size(), shardInfo.getTotal()); + } + + public void testPrimaryToPrimaryReplicationWithRemoteTranslogEnabled() throws Exception { + Set initializingIds = new HashSet<>(); + IntStream.range(0, randomIntBetween(2, 5)).forEach(x -> initializingIds.add(AllocationId.newInitializing())); + Set activeIds = new HashSet<>(); + IntStream.range(0, randomIntBetween(2, 5)).forEach(x -> activeIds.add(AllocationId.newInitializing())); + + AllocationId primaryId = AllocationId.newRelocation(AllocationId.newInitializing()); + AllocationId relocationTargetId = AllocationId.newTargetRelocation(primaryId); + + ShardId shardId = new ShardId("test", "_na_", 0); + IndexShardRoutingTable.Builder builder = new IndexShardRoutingTable.Builder(shardId); + final ShardRouting primaryShard = newShardRouting( + shardId, + nodeIdFromAllocationId(primaryId), + nodeIdFromAllocationId(relocationTargetId), + true, + ShardRoutingState.RELOCATING, + primaryId + ); + initializingIds.forEach( + aId -> builder.addShard(newShardRouting(shardId, nodeIdFromAllocationId(aId), null, false, ShardRoutingState.INITIALIZING, aId)) + ); + activeIds.forEach( + aId -> builder.addShard(newShardRouting(shardId, nodeIdFromAllocationId(aId), null, false, ShardRoutingState.STARTED, aId)) + ); + builder.addShard(primaryShard); + IndexShardRoutingTable routingTable = builder.build(); + + // Add primary and it's relocating target to activeIds + activeIds.add(primaryId); + activeIds.add(relocationTargetId); + + Set inSyncAllocationIds = activeIds.stream().map(AllocationId::getId).collect(Collectors.toSet()); + ReplicationGroup replicationGroup = new ReplicationGroup(routingTable, inSyncAllocationIds, inSyncAllocationIds, 0); + List replicationTargets = replicationGroup.getReplicationTargets(); + assertEquals(inSyncAllocationIds.size(), replicationTargets.size()); + assertTrue( + replicationTargets.stream().map(sh -> sh.allocationId().getId()).collect(Collectors.toSet()).containsAll(inSyncAllocationIds) + ); + + Request request = new Request(shardId); + PlainActionFuture listener = new PlainActionFuture<>(); + Map simulatedFailures = new HashMap<>(); + TestReplicaProxy replicasProxy = new TestReplicaProxy(simulatedFailures); + TestPrimary primary = new TestPrimary(primaryShard, () -> replicationGroup, threadPool); + final TestReplicationOperation op = new TestReplicationOperation( + request, + primary, + listener, + replicasProxy, + 0, + new ReplicationModeAwareProxy<>(ReplicationMode.NO_REPLICATION) + ); + op.execute(); + assertTrue("request was not processed on primary", request.processedOnPrimary.get()); + assertEquals(1, request.processedOnReplicas.size()); + assertEquals(0, replicasProxy.failedReplicas.size()); + assertEquals(0, replicasProxy.markedAsStaleCopies.size()); + assertTrue("post replication operations not run on primary", request.runPostReplicationActionsOnPrimary.get()); + assertTrue("listener is not marked as done", listener.isDone()); + + ShardInfo shardInfo = listener.actionGet().getShardInfo(); + assertEquals(2 + initializingIds.size(), shardInfo.getTotal()); + } + + public void testForceReplicationWithRemoteTranslogEnabled() throws Exception { + Set initializingIds = new HashSet<>(); + IntStream.range(0, randomIntBetween(2, 5)).forEach(x -> initializingIds.add(AllocationId.newInitializing())); + Set activeIds = new HashSet<>(); + IntStream.range(0, randomIntBetween(2, 5)).forEach(x -> activeIds.add(AllocationId.newInitializing())); + + AllocationId primaryId = activeIds.iterator().next(); + + ShardId shardId = new ShardId("test", "_na_", 0); + IndexShardRoutingTable.Builder builder = new IndexShardRoutingTable.Builder(shardId); + final ShardRouting primaryShard = newShardRouting( + shardId, + nodeIdFromAllocationId(primaryId), + null, + true, + ShardRoutingState.STARTED, + primaryId + ); + initializingIds.forEach( + aId -> builder.addShard(newShardRouting(shardId, nodeIdFromAllocationId(aId), null, false, ShardRoutingState.INITIALIZING, aId)) + ); + activeIds.stream() + .filter(aId -> !aId.equals(primaryId)) + .forEach( + aId -> builder.addShard(newShardRouting(shardId, nodeIdFromAllocationId(aId), null, false, ShardRoutingState.STARTED, aId)) + ); + builder.addShard(primaryShard); + IndexShardRoutingTable routingTable = builder.build(); + + Set inSyncAllocationIds = activeIds.stream().map(AllocationId::getId).collect(Collectors.toSet()); + ReplicationGroup replicationGroup = new ReplicationGroup(routingTable, inSyncAllocationIds, inSyncAllocationIds, 0); + List replicationTargets = replicationGroup.getReplicationTargets(); + assertEquals(inSyncAllocationIds.size(), replicationTargets.size()); + assertTrue( + replicationTargets.stream().map(sh -> sh.allocationId().getId()).collect(Collectors.toSet()).containsAll(inSyncAllocationIds) + ); + + Request request = new Request(shardId); + PlainActionFuture listener = new PlainActionFuture<>(); + Map simulatedFailures = new HashMap<>(); + TestReplicaProxy replicasProxy = new TestReplicaProxy(simulatedFailures); + TestPrimary primary = new TestPrimary(primaryShard, () -> replicationGroup, threadPool); + final TestReplicationOperation op = new TestReplicationOperation( + request, + primary, + listener, + replicasProxy, + 0, + new FanoutReplicationProxy<>() + ); + op.execute(); + assertTrue("request was not processed on primary", request.processedOnPrimary.get()); + assertEquals(activeIds.size() - 1, request.processedOnReplicas.size()); + assertEquals(0, replicasProxy.failedReplicas.size()); + assertEquals(0, replicasProxy.markedAsStaleCopies.size()); + assertTrue("post replication operations not run on primary", request.runPostReplicationActionsOnPrimary.get()); + assertTrue("listener is not marked as done", listener.isDone()); + + ShardInfo shardInfo = listener.actionGet().getShardInfo(); + assertEquals(activeIds.size() + initializingIds.size(), shardInfo.getTotal()); + } + + static String nodeIdFromAllocationId(final AllocationId allocationId) { + return "n-" + allocationId.getId().substring(0, 8); + } + public void testRetryTransientReplicationFailure() throws Exception { final String index = "test"; final ShardId shardId = new ShardId(index, "_na_", 0); @@ -242,7 +447,8 @@ public void testRetryTransientReplicationFailure() throws Exception { replicasProxy, primaryTerm, TimeValue.timeValueMillis(20), - TimeValue.timeValueSeconds(60) + TimeValue.timeValueSeconds(60), + new FanoutReplicationProxy<>() ); op.execute(); assertThat("request was not processed on primary", request.processedOnPrimary.get(), equalTo(true)); @@ -379,7 +585,14 @@ public void failShard(String message, Exception exception) { assertTrue(primaryFailed.compareAndSet(false, true)); } }; - final TestReplicationOperation op = new TestReplicationOperation(request, primary, listener, replicasProxy, primaryTerm); + final TestReplicationOperation op = new TestReplicationOperation( + request, + primary, + listener, + replicasProxy, + primaryTerm, + new FanoutReplicationProxy<>() + ); op.execute(); assertThat("request was not processed on primary", request.processedOnPrimary.get(), equalTo(true)); @@ -389,7 +602,7 @@ public void failShard(String message, Exception exception) { } else { assertFalse(primaryFailed.get()); } - assertListenerThrows("should throw exception to trigger retry", listener, ReplicationOperation.RetryOnPrimaryException.class); + assertListenerThrows("should throw exception to trigger retry", listener, RetryOnPrimaryException.class); } public void testAddedReplicaAfterPrimaryOperation() throws Exception { @@ -438,7 +651,14 @@ public void perform(Request request, ActionListener listener) { Request request = new Request(shardId); PlainActionFuture listener = new PlainActionFuture<>(); - final TestReplicationOperation op = new TestReplicationOperation(request, primary, listener, new TestReplicaProxy(), primaryTerm); + final TestReplicationOperation op = new TestReplicationOperation( + request, + primary, + listener, + new TestReplicaProxy(), + primaryTerm, + new FanoutReplicationProxy<>() + ); op.execute(); assertThat("request was not processed on primary", request.processedOnPrimary.get(), equalTo(true)); @@ -493,7 +713,8 @@ public void testWaitForActiveShards() throws Exception { logger, threadPool, "test", - primaryTerm + primaryTerm, + new FanoutReplicationProxy<>() ); if (passesActiveShardCheck) { @@ -554,7 +775,14 @@ public void updateLocalCheckpointForShard(String allocationId, long checkpoint) final PlainActionFuture listener = new PlainActionFuture<>(); final ReplicationOperation.Replicas replicas = new TestReplicaProxy(Collections.emptyMap()); - TestReplicationOperation operation = new TestReplicationOperation(request, primary, listener, replicas, primaryTerm); + TestReplicationOperation operation = new TestReplicationOperation( + request, + primary, + listener, + replicas, + primaryTerm, + new FanoutReplicationProxy<>() + ); operation.execute(); assertThat(primaryFailed.get(), equalTo(fatal)); @@ -841,7 +1069,8 @@ class TestReplicationOperation extends ReplicationOperation replicas, long primaryTerm, TimeValue initialRetryBackoffBound, - TimeValue retryTimeout + TimeValue retryTimeout, + ReplicationProxy replicationProxy ) { this( request, @@ -853,7 +1082,8 @@ class TestReplicationOperation extends ReplicationOperation primary, ActionListener listener, Replicas replicas, - long primaryTerm + long primaryTerm, + ReplicationProxy replicationProxy ) { - this(request, primary, listener, replicas, ReplicationOperationTests.this.logger, threadPool, "test", primaryTerm); + this( + request, + primary, + listener, + replicas, + ReplicationOperationTests.this.logger, + threadPool, + "test", + primaryTerm, + replicationProxy + ); } TestReplicationOperation( @@ -875,7 +1116,8 @@ class TestReplicationOperation extends ReplicationOperation replicationProxy ) { this( request, @@ -887,7 +1129,8 @@ class TestReplicationOperation extends ReplicationOperation replicationProxy ) { - super(request, primary, listener, replicas, logger, threadPool, opType, primaryTerm, initialRetryBackoffBound, retryTimeout); + super( + request, + primary, + listener, + replicas, + logger, + threadPool, + opType, + primaryTerm, + initialRetryBackoffBound, + retryTimeout, + replicationProxy + ); } } diff --git a/server/src/test/java/org/opensearch/action/support/replication/TransportReplicationActionTests.java b/server/src/test/java/org/opensearch/action/support/replication/TransportReplicationActionTests.java index 696958c340375..bde483e171d1e 100644 --- a/server/src/test/java/org/opensearch/action/support/replication/TransportReplicationActionTests.java +++ b/server/src/test/java/org/opensearch/action/support/replication/TransportReplicationActionTests.java @@ -33,6 +33,12 @@ package org.opensearch.action.support.replication; import org.apache.lucene.store.AlreadyClosedException; +import org.hamcrest.Matcher; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; import org.opensearch.OpenSearchException; import org.opensearch.Version; import org.opensearch.action.ActionListener; @@ -99,12 +105,6 @@ import org.opensearch.transport.TransportResponse; import org.opensearch.transport.TransportService; import org.opensearch.transport.nio.MockNioTransport; -import org.hamcrest.Matcher; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; import java.io.IOException; import java.util.Collections; @@ -121,11 +121,6 @@ import java.util.stream.Collectors; import static java.util.Collections.singleton; -import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; -import static org.opensearch.action.support.replication.ClusterStateCreationUtils.stateWithActivePrimary; -import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_WAIT_FOR_ACTIVE_SHARDS; -import static org.opensearch.test.ClusterServiceUtils.createClusterService; -import static org.opensearch.test.ClusterServiceUtils.setState; import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.Matchers.arrayWithSize; import static org.hamcrest.Matchers.equalTo; @@ -138,12 +133,17 @@ import static org.mockito.Mockito.anyInt; import static org.mockito.Mockito.anyLong; import static org.mockito.Mockito.anyString; -import static org.mockito.Mockito.eq; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import static org.opensearch.action.support.replication.ClusterStateCreationUtils.state; +import static org.opensearch.action.support.replication.ClusterStateCreationUtils.stateWithActivePrimary; +import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_WAIT_FOR_ACTIVE_SHARDS; +import static org.opensearch.test.ClusterServiceUtils.createClusterService; +import static org.opensearch.test.ClusterServiceUtils.setState; public class TransportReplicationActionTests extends OpenSearchTestCase { @@ -950,7 +950,8 @@ public void testSeqNoIsSetOnPrimary() { Set inSyncIds = randomBoolean() ? singleton(routingEntry.allocationId().getId()) : clusterService.state().metadata().index(index).inSyncAllocationIds(0); - ReplicationGroup replicationGroup = new ReplicationGroup(shardRoutingTable, inSyncIds, shardRoutingTable.getAllAllocationIds(), 0); + Set trackedAllocationIds = shardRoutingTable.getAllAllocationIds(); + ReplicationGroup replicationGroup = new ReplicationGroup(shardRoutingTable, inSyncIds, trackedAllocationIds, 0); when(shard.getReplicationGroup()).thenReturn(replicationGroup); PendingReplicationActions replicationActions = new PendingReplicationActions(shardId, threadPool); replicationActions.accept(replicationGroup); diff --git a/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTestCase.java b/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTestCase.java index 34a2d1189d234..17a6bfc8fbd82 100644 --- a/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTestCase.java +++ b/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTestCase.java @@ -43,6 +43,7 @@ import org.opensearch.test.OpenSearchTestCase; import org.opensearch.test.IndexSettingsModule; +import java.util.Collections; import java.util.Set; import java.util.function.LongConsumer; import java.util.function.LongSupplier; @@ -55,12 +56,13 @@ public abstract class ReplicationTrackerTestCase extends OpenSearchTestCase { ReplicationTracker newTracker( final AllocationId allocationId, final LongConsumer updatedGlobalCheckpoint, - final LongSupplier currentTimeMillisSupplier + final LongSupplier currentTimeMillisSupplier, + final Settings settings ) { return new ReplicationTracker( new ShardId("test", "_na_", 0), allocationId.getId(), - IndexSettingsModule.newIndexSettings("test", Settings.EMPTY), + IndexSettingsModule.newIndexSettings("test", settings), randomNonNegativeLong(), UNASSIGNED_SEQ_NO, updatedGlobalCheckpoint, @@ -70,6 +72,14 @@ ReplicationTracker newTracker( ); } + ReplicationTracker newTracker( + final AllocationId allocationId, + final LongConsumer updatedGlobalCheckpoint, + final LongSupplier currentTimeMillisSupplier + ) { + return newTracker(allocationId, updatedGlobalCheckpoint, currentTimeMillisSupplier, Settings.EMPTY); + } + static final Supplier OPS_BASED_RECOVERY_ALWAYS_REASONABLE = () -> SafeCommitInfo.EMPTY; static String nodeIdFromAllocationId(final AllocationId allocationId) { @@ -77,6 +87,14 @@ static String nodeIdFromAllocationId(final AllocationId allocationId) { } static IndexShardRoutingTable routingTable(final Set initializingIds, final AllocationId primaryId) { + return routingTable(initializingIds, Collections.singleton(primaryId), primaryId); + } + + static IndexShardRoutingTable routingTable( + final Set initializingIds, + final Set activeIds, + final AllocationId primaryId + ) { final ShardId shardId = new ShardId("test", "_na_", 0); final ShardRouting primaryShard = TestShardRouting.newShardRouting( shardId, @@ -86,11 +104,17 @@ static IndexShardRoutingTable routingTable(final Set initializingI ShardRoutingState.STARTED, primaryId ); - return routingTable(initializingIds, primaryShard); + return routingTable(initializingIds, activeIds, primaryShard); } - static IndexShardRoutingTable routingTable(final Set initializingIds, final ShardRouting primaryShard) { + static IndexShardRoutingTable routingTable( + final Set initializingIds, + final Set activeIds, + final ShardRouting primaryShard + ) { + assert initializingIds != null && activeIds != null; assert !initializingIds.contains(primaryShard.allocationId()); + assert activeIds.contains(primaryShard.allocationId()); final ShardId shardId = new ShardId("test", "_na_", 0); final IndexShardRoutingTable.Builder builder = new IndexShardRoutingTable.Builder(shardId); for (final AllocationId initializingId : initializingIds) { @@ -105,6 +129,21 @@ static IndexShardRoutingTable routingTable(final Set initializingI ) ); } + for (final AllocationId activeId : activeIds) { + if (activeId.equals(primaryShard.allocationId())) { + continue; + } + builder.addShard( + TestShardRouting.newShardRouting( + shardId, + nodeIdFromAllocationId(activeId), + null, + false, + ShardRoutingState.STARTED, + activeId + ) + ); + } builder.addShard(primaryShard); diff --git a/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTests.java b/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTests.java index 66c484cd40cce..8ea64e71fb9dc 100644 --- a/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTests.java +++ b/server/src/test/java/org/opensearch/index/seqno/ReplicationTrackerTests.java @@ -437,6 +437,10 @@ public void testWaitForAllocationIdToBeInSync() throws Exception { private AtomicLong updatedGlobalCheckpoint = new AtomicLong(UNASSIGNED_SEQ_NO); + private ReplicationTracker newTracker(final AllocationId allocationId, Settings settings) { + return newTracker(allocationId, updatedGlobalCheckpoint::set, () -> 0L, settings); + } + private ReplicationTracker newTracker(final AllocationId allocationId) { return newTracker(allocationId, updatedGlobalCheckpoint::set, () -> 0L); } @@ -966,7 +970,11 @@ private static FakeClusterState initialState() { relocatingId ); - return new FakeClusterState(initialClusterStateVersion, activeAllocationIds, routingTable(initializingAllocationIds, primaryShard)); + return new FakeClusterState( + initialClusterStateVersion, + activeAllocationIds, + routingTable(initializingAllocationIds, Collections.singleton(primaryShard.allocationId()), primaryShard) + ); } private static void randomLocalCheckpointUpdate(ReplicationTracker gcp) { @@ -1007,6 +1015,7 @@ private static FakeClusterState randomUpdateClusterState(Set allocationI remainingInSyncIds.isEmpty() ? clusterState.inSyncIds : remainingInSyncIds, routingTable( Sets.difference(Sets.union(initializingIdsExceptRelocationTargets, initializingIdsToAdd), initializingIdsToRemove), + Collections.singleton(clusterState.routingTable.primaryShard().allocationId()), clusterState.routingTable.primaryShard() ) ); @@ -1046,9 +1055,20 @@ private static void markAsTrackingAndInSyncQuietly( final ReplicationTracker tracker, final String allocationId, final long localCheckpoint + ) { + markAsTrackingAndInSyncQuietly(tracker, allocationId, localCheckpoint, true); + } + + private static void markAsTrackingAndInSyncQuietly( + final ReplicationTracker tracker, + final String allocationId, + final long localCheckpoint, + final boolean addPRRL ) { try { - addPeerRecoveryRetentionLease(tracker, allocationId); + if (addPRRL) { + addPeerRecoveryRetentionLease(tracker, allocationId); + } tracker.initiateTracking(allocationId); tracker.markAllocationIdAsInSync(allocationId, localCheckpoint); } catch (final InterruptedException e) { @@ -1252,4 +1272,695 @@ public void testPeerRecoveryRetentionLeaseCreationAndRenewal() { ); } + /** + * This test checks that the global checkpoint update mechanism is honored and relies only on the shards that have + * translog stored locally. + */ + public void testGlobalCheckpointUpdateWithRemoteTranslogEnabled() { + final long initialClusterStateVersion = randomNonNegativeLong(); + Map activeWithCheckpoints = randomAllocationsWithLocalCheckpoints(1, 5); + Set active = new HashSet<>(activeWithCheckpoints.keySet()); + Map allocations = new HashMap<>(activeWithCheckpoints); + Map initializingWithCheckpoints = randomAllocationsWithLocalCheckpoints(0, 5); + Set initializing = new HashSet<>(initializingWithCheckpoints.keySet()); + allocations.putAll(initializingWithCheckpoints); + assertThat(allocations.size(), equalTo(active.size() + initializing.size())); + + final AllocationId primaryId = active.iterator().next(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + assertThat(tracker.getGlobalCheckpoint(), equalTo(UNASSIGNED_SEQ_NO)); + + long primaryLocalCheckpoint = activeWithCheckpoints.get(primaryId); + + logger.info("--> using allocations"); + allocations.keySet().forEach(aId -> { + final String type; + if (active.contains(aId)) { + type = "active"; + } else if (initializing.contains(aId)) { + type = "init"; + } else { + throw new IllegalStateException(aId + " not found in any map"); + } + logger.info(" - [{}], local checkpoint [{}], [{}]", aId, allocations.get(aId), type); + }); + + tracker.updateFromClusterManager(initialClusterStateVersion, ids(active), routingTable(initializing, primaryId)); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + assertThat(tracker.getReplicationGroup().getReplicationTargets().size(), equalTo(1)); + initializing.forEach(aId -> markAsTrackingAndInSyncQuietly(tracker, aId.getId(), NO_OPS_PERFORMED, false)); + assertThat(tracker.getReplicationGroup().getReplicationTargets().size(), equalTo(1 + initializing.size())); + Set replicationTargets = tracker.getReplicationGroup() + .getReplicationTargets() + .stream() + .map(ShardRouting::allocationId) + .collect(Collectors.toSet()); + assertTrue(replicationTargets.containsAll(initializing)); + allocations.keySet().forEach(aId -> updateLocalCheckpoint(tracker, aId.getId(), allocations.get(aId))); + + assertEquals(tracker.getGlobalCheckpoint(), primaryLocalCheckpoint); + + // increment checkpoints + active.forEach(aId -> allocations.put(aId, allocations.get(aId) + 1 + randomInt(4))); + initializing.forEach(aId -> allocations.put(aId, allocations.get(aId) + 1 + randomInt(4))); + allocations.keySet().forEach(aId -> updateLocalCheckpoint(tracker, aId.getId(), allocations.get(aId))); + + final long minLocalCheckpointAfterUpdates = allocations.values().stream().min(Long::compareTo).orElse(UNASSIGNED_SEQ_NO); + + // now insert an unknown active/insync id , the checkpoint shouldn't change but a refresh should be requested. + final AllocationId extraId = AllocationId.newInitializing(); + + // first check that adding it without the cluster-manager blessing doesn't change anything. + updateLocalCheckpoint(tracker, extraId.getId(), minLocalCheckpointAfterUpdates + 1 + randomInt(4)); + assertNull(tracker.checkpoints.get(extraId.getId())); + expectThrows(IllegalStateException.class, () -> tracker.initiateTracking(extraId.getId())); + + Set newInitializing = new HashSet<>(initializing); + newInitializing.add(extraId); + tracker.updateFromClusterManager(initialClusterStateVersion + 1, ids(active), routingTable(newInitializing, primaryId)); + + tracker.initiateTracking(extraId.getId()); + + // now notify for the new id + if (randomBoolean()) { + updateLocalCheckpoint(tracker, extraId.getId(), minLocalCheckpointAfterUpdates + 1 + randomInt(4)); + markAsTrackingAndInSyncQuietly(tracker, extraId.getId(), randomInt((int) minLocalCheckpointAfterUpdates), false); + } else { + markAsTrackingAndInSyncQuietly(tracker, extraId.getId(), minLocalCheckpointAfterUpdates + 1 + randomInt(4), false); + } + } + + public void testUpdateFromClusterManagerWithRemoteTranslogEnabled() { + final long initialClusterStateVersion = randomNonNegativeLong(); + Map activeWithCheckpoints = randomAllocationsWithLocalCheckpoints(2, 5); + Set active = new HashSet<>(activeWithCheckpoints.keySet()); + Map allocations = new HashMap<>(activeWithCheckpoints); + Map initializingWithCheckpoints = randomAllocationsWithLocalCheckpoints(0, 5); + Set initializing = new HashSet<>(initializingWithCheckpoints.keySet()); + allocations.putAll(initializingWithCheckpoints); + assertThat(allocations.size(), equalTo(active.size() + initializing.size())); + + final AllocationId primaryId = active.iterator().next(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + assertThat(tracker.getGlobalCheckpoint(), equalTo(UNASSIGNED_SEQ_NO)); + + long primaryLocalCheckpoint = activeWithCheckpoints.get(primaryId); + + logger.info("--> using allocations"); + allocations.keySet().forEach(aId -> { + final String type; + if (active.contains(aId)) { + type = "active"; + } else if (initializing.contains(aId)) { + type = "init"; + } else { + throw new IllegalStateException(aId + " not found in any map"); + } + logger.info(" - [{}], local checkpoint [{}], [{}]", aId, allocations.get(aId), type); + }); + + tracker.updateFromClusterManager(initialClusterStateVersion, ids(active), routingTable(initializing, active, primaryId)); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + assertEquals(tracker.getReplicationGroup().getReplicationTargets().size(), active.size()); + initializing.forEach(aId -> markAsTrackingAndInSyncQuietly(tracker, aId.getId(), NO_OPS_PERFORMED, false)); + assertEquals(tracker.getReplicationGroup().getReplicationTargets().size(), active.size() + initializing.size()); + Set replicationTargets = tracker.getReplicationGroup() + .getReplicationTargets() + .stream() + .map(ShardRouting::allocationId) + .collect(Collectors.toSet()); + assertTrue(replicationTargets.containsAll(initializing)); + assertTrue(replicationTargets.containsAll(active)); + allocations.keySet().forEach(aId -> updateLocalCheckpoint(tracker, aId.getId(), allocations.get(aId))); + + assertEquals(tracker.getGlobalCheckpoint(), primaryLocalCheckpoint); + + // increment checkpoints + active.forEach(aId -> allocations.put(aId, allocations.get(aId) + 1 + randomInt(4))); + initializing.forEach(aId -> allocations.put(aId, allocations.get(aId) + 1 + randomInt(4))); + allocations.keySet().forEach(aId -> updateLocalCheckpoint(tracker, aId.getId(), allocations.get(aId))); + + final long minLocalCheckpointAfterUpdates = allocations.values().stream().min(Long::compareTo).orElse(UNASSIGNED_SEQ_NO); + + // now insert an unknown active/insync id , the checkpoint shouldn't change but a refresh should be requested. + final AllocationId extraId = AllocationId.newInitializing(); + + // first check that adding it without the cluster-manager blessing doesn't change anything. + updateLocalCheckpoint(tracker, extraId.getId(), minLocalCheckpointAfterUpdates + 1 + randomInt(4)); + assertNull(tracker.checkpoints.get(extraId.getId())); + expectThrows(IllegalStateException.class, () -> tracker.initiateTracking(extraId.getId())); + + Set newInitializing = new HashSet<>(initializing); + newInitializing.add(extraId); + tracker.updateFromClusterManager(initialClusterStateVersion + 1, ids(active), routingTable(newInitializing, primaryId)); + + tracker.initiateTracking(extraId.getId()); + + // now notify for the new id + if (randomBoolean()) { + updateLocalCheckpoint(tracker, extraId.getId(), minLocalCheckpointAfterUpdates + 1 + randomInt(4)); + markAsTrackingAndInSyncQuietly(tracker, extraId.getId(), randomInt((int) minLocalCheckpointAfterUpdates), false); + } else { + markAsTrackingAndInSyncQuietly(tracker, extraId.getId(), minLocalCheckpointAfterUpdates + 1 + randomInt(4), false); + } + } + + /** + * This test checks that updateGlobalCheckpointOnReplica with remote translog does not violate any of the invariants + */ + public void testUpdateGlobalCheckpointOnReplicaWithRemoteTranslogEnabled() { + final AllocationId active = AllocationId.newInitializing(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(active, settings); + final long globalCheckpoint = randomLongBetween(NO_OPS_PERFORMED, Long.MAX_VALUE - 1); + tracker.updateGlobalCheckpointOnReplica(globalCheckpoint, "test"); + assertEquals(updatedGlobalCheckpoint.get(), globalCheckpoint); + final long nonUpdate = randomLongBetween(NO_OPS_PERFORMED, globalCheckpoint); + updatedGlobalCheckpoint.set(UNASSIGNED_SEQ_NO); + tracker.updateGlobalCheckpointOnReplica(nonUpdate, "test"); + assertEquals(updatedGlobalCheckpoint.get(), UNASSIGNED_SEQ_NO); + final long update = randomLongBetween(globalCheckpoint, Long.MAX_VALUE); + tracker.updateGlobalCheckpointOnReplica(update, "test"); + assertEquals(updatedGlobalCheckpoint.get(), update); + } + + public void testMarkAllocationIdAsInSyncWithRemoteTranslogEnabled() throws Exception { + final long initialClusterStateVersion = randomNonNegativeLong(); + Map activeWithCheckpoints = randomAllocationsWithLocalCheckpoints(1, 1); + Set active = new HashSet<>(activeWithCheckpoints.keySet()); + Map initializingWithCheckpoints = randomAllocationsWithLocalCheckpoints(1, 1); + Set initializing = new HashSet<>(initializingWithCheckpoints.keySet()); + final AllocationId primaryId = active.iterator().next(); + final AllocationId replicaId = initializing.iterator().next(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + tracker.updateFromClusterManager(initialClusterStateVersion, ids(active), routingTable(initializing, primaryId)); + final long localCheckpoint = randomLongBetween(0, Long.MAX_VALUE - 1); + tracker.activatePrimaryMode(localCheckpoint); + tracker.initiateTracking(replicaId.getId()); + tracker.markAllocationIdAsInSync(replicaId.getId(), randomLongBetween(NO_OPS_PERFORMED, localCheckpoint - 1)); + assertFalse(tracker.pendingInSync()); + final long updatedLocalCheckpoint = randomLongBetween(1 + localCheckpoint, Long.MAX_VALUE); + updatedGlobalCheckpoint.set(UNASSIGNED_SEQ_NO); + tracker.updateLocalCheckpoint(primaryId.getId(), updatedLocalCheckpoint); + assertEquals(updatedGlobalCheckpoint.get(), updatedLocalCheckpoint); + tracker.updateLocalCheckpoint(replicaId.getId(), localCheckpoint); + assertEquals(updatedGlobalCheckpoint.get(), updatedLocalCheckpoint); + tracker.markAllocationIdAsInSync(replicaId.getId(), updatedLocalCheckpoint); + assertEquals(updatedGlobalCheckpoint.get(), updatedLocalCheckpoint); + } + + public void testMissingActiveIdsDoesNotPreventAdvanceWithRemoteTranslogEnabled() { + final Map active = randomAllocationsWithLocalCheckpoints(2, 5); + final Map initializing = randomAllocationsWithLocalCheckpoints(0, 5); + final Map assigned = new HashMap<>(); + assigned.putAll(active); + assigned.putAll(initializing); + AllocationId primaryId = active.keySet().iterator().next(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + tracker.updateFromClusterManager(randomNonNegativeLong(), ids(active.keySet()), routingTable(initializing.keySet(), primaryId)); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + List initializingRandomSubset = randomSubsetOf(initializing.keySet()); + initializingRandomSubset.forEach(k -> markAsTrackingAndInSyncQuietly(tracker, k.getId(), NO_OPS_PERFORMED)); + final AllocationId missingActiveID = randomFrom(active.keySet()); + assigned.entrySet() + .stream() + .filter(e -> !e.getKey().equals(missingActiveID)) + .forEach(e -> updateLocalCheckpoint(tracker, e.getKey().getId(), e.getValue())); + long primaryLocalCheckpoint = active.get(primaryId); + + assertEquals(1 + initializingRandomSubset.size(), tracker.getReplicationGroup().getReplicationTargets().size()); + if (missingActiveID.equals(primaryId) == false) { + assertEquals(tracker.getGlobalCheckpoint(), primaryLocalCheckpoint); + assertEquals(updatedGlobalCheckpoint.get(), primaryLocalCheckpoint); + } + // now update all knowledge of all shards + assigned.forEach((aid, localCP) -> updateLocalCheckpoint(tracker, aid.getId(), 10 + localCP)); + assertEquals(tracker.getGlobalCheckpoint(), 10 + primaryLocalCheckpoint); + assertEquals(updatedGlobalCheckpoint.get(), 10 + primaryLocalCheckpoint); + } + + public void testMissingInSyncIdsDoesNotPreventAdvanceWithRemoteTranslogEnabled() { + final Map active = randomAllocationsWithLocalCheckpoints(1, 5); + final Map initializing = randomAllocationsWithLocalCheckpoints(2, 5); + logger.info("active: {}, initializing: {}", active, initializing); + + AllocationId primaryId = active.keySet().iterator().next(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + tracker.updateFromClusterManager(randomNonNegativeLong(), ids(active.keySet()), routingTable(initializing.keySet(), primaryId)); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + randomSubsetOf(randomIntBetween(1, initializing.size() - 1), initializing.keySet()).forEach( + aId -> markAsTrackingAndInSyncQuietly(tracker, aId.getId(), NO_OPS_PERFORMED) + ); + long primaryLocalCheckpoint = active.get(primaryId); + + active.forEach((aid, localCP) -> updateLocalCheckpoint(tracker, aid.getId(), localCP)); + + assertEquals(tracker.getGlobalCheckpoint(), primaryLocalCheckpoint); + assertEquals(updatedGlobalCheckpoint.get(), primaryLocalCheckpoint); + + // update again + initializing.forEach((aid, localCP) -> updateLocalCheckpoint(tracker, aid.getId(), localCP)); + assertEquals(tracker.getGlobalCheckpoint(), primaryLocalCheckpoint); + assertEquals(updatedGlobalCheckpoint.get(), primaryLocalCheckpoint); + } + + public void testInSyncIdsAreIgnoredIfNotValidatedByClusterManagerWithRemoteTranslogEnabled() { + final Map active = randomAllocationsWithLocalCheckpoints(1, 5); + final Map initializing = randomAllocationsWithLocalCheckpoints(1, 5); + final Map nonApproved = randomAllocationsWithLocalCheckpoints(1, 5); + final AllocationId primaryId = active.keySet().iterator().next(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + tracker.updateFromClusterManager(randomNonNegativeLong(), ids(active.keySet()), routingTable(initializing.keySet(), primaryId)); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + initializing.keySet().forEach(k -> markAsTrackingAndInSyncQuietly(tracker, k.getId(), NO_OPS_PERFORMED)); + nonApproved.keySet() + .forEach( + k -> expectThrows(IllegalStateException.class, () -> markAsTrackingAndInSyncQuietly(tracker, k.getId(), NO_OPS_PERFORMED)) + ); + + List> allocations = Arrays.asList(active, initializing, nonApproved); + Collections.shuffle(allocations, random()); + allocations.forEach(a -> a.forEach((aid, localCP) -> updateLocalCheckpoint(tracker, aid.getId(), localCP))); + + assertNotEquals(UNASSIGNED_SEQ_NO, tracker.getGlobalCheckpoint()); + } + + public void testInSyncIdsAreRemovedIfNotValidatedByClusterManagerWithRemoteTranslogEnabled() { + final long initialClusterStateVersion = randomNonNegativeLong(); + final Map activeToStay = randomAllocationsWithLocalCheckpoints(1, 5); + final Map initializingToStay = randomAllocationsWithLocalCheckpoints(1, 5); + final Map activeToBeRemoved = randomAllocationsWithLocalCheckpoints(1, 5); + final Map initializingToBeRemoved = randomAllocationsWithLocalCheckpoints(1, 5); + final Set active = Sets.union(activeToStay.keySet(), activeToBeRemoved.keySet()); + final Set initializing = Sets.union(initializingToStay.keySet(), initializingToBeRemoved.keySet()); + final Map allocations = new HashMap<>(); + final AllocationId primaryId = active.iterator().next(); + if (activeToBeRemoved.containsKey(primaryId)) { + activeToStay.put(primaryId, activeToBeRemoved.remove(primaryId)); + } + allocations.putAll(activeToStay); + if (randomBoolean()) { + allocations.putAll(activeToBeRemoved); + } + allocations.putAll(initializingToStay); + if (randomBoolean()) { + allocations.putAll(initializingToBeRemoved); + } + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + tracker.updateFromClusterManager(initialClusterStateVersion, ids(active), routingTable(initializing, primaryId)); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + if (randomBoolean()) { + initializingToStay.keySet().forEach(k -> markAsTrackingAndInSyncQuietly(tracker, k.getId(), NO_OPS_PERFORMED)); + } else { + initializing.forEach(k -> markAsTrackingAndInSyncQuietly(tracker, k.getId(), NO_OPS_PERFORMED)); + } + if (randomBoolean()) { + allocations.forEach((aid, localCP) -> updateLocalCheckpoint(tracker, aid.getId(), localCP)); + } + + // now remove shards + if (randomBoolean()) { + tracker.updateFromClusterManager( + initialClusterStateVersion + 1, + ids(activeToStay.keySet()), + routingTable(initializingToStay.keySet(), primaryId) + ); + allocations.forEach((aid, ckp) -> updateLocalCheckpoint(tracker, aid.getId(), ckp + 10L)); + } else { + allocations.forEach((aid, ckp) -> updateLocalCheckpoint(tracker, aid.getId(), ckp + 10L)); + tracker.updateFromClusterManager( + initialClusterStateVersion + 2, + ids(activeToStay.keySet()), + routingTable(initializingToStay.keySet(), primaryId) + ); + } + + final long checkpoint = activeToStay.get(primaryId) + 10; + assertEquals(tracker.getGlobalCheckpoint(), checkpoint); + } + + public void testUpdateAllocationIdsFromClusterManagerWithRemoteTranslogEnabled() throws Exception { + final long initialClusterStateVersion = randomNonNegativeLong(); + final int numberOfActiveAllocationsIds = randomIntBetween(2, 16); + final int numberOfInitializingIds = randomIntBetween(2, 16); + final Tuple, Set> activeAndInitializingAllocationIds = randomActiveAndInitializingAllocationIds( + numberOfActiveAllocationsIds, + numberOfInitializingIds + ); + final Set activeAllocationIds = activeAndInitializingAllocationIds.v1(); + final Set initializingIds = activeAndInitializingAllocationIds.v2(); + AllocationId primaryId = activeAllocationIds.iterator().next(); + IndexShardRoutingTable routingTable = routingTable(initializingIds, primaryId); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(primaryId, settings); + tracker.updateFromClusterManager(initialClusterStateVersion, ids(activeAllocationIds), routingTable); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + assertThat(tracker.getReplicationGroup().getInSyncAllocationIds(), equalTo(ids(activeAllocationIds))); + assertThat(tracker.getReplicationGroup().getRoutingTable(), equalTo(routingTable)); + + // first we assert that the in-sync and tracking sets are set up correctly + assertTrue(activeAllocationIds.stream().allMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).inSync)); + assertTrue( + activeAllocationIds.stream() + .filter(a -> a.equals(primaryId) == false) + .allMatch( + a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).getLocalCheckpoint() == SequenceNumbers.UNASSIGNED_SEQ_NO + ) + ); + assertTrue(initializingIds.stream().noneMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).inSync)); + assertTrue( + initializingIds.stream() + .filter(a -> a.equals(primaryId) == false) + .allMatch( + a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).getLocalCheckpoint() == SequenceNumbers.UNASSIGNED_SEQ_NO + ) + ); + + // now we will remove some allocation IDs from these and ensure that they propagate through + final Set removingActiveAllocationIds = new HashSet<>(randomSubsetOf(activeAllocationIds)); + removingActiveAllocationIds.remove(primaryId); + final Set newActiveAllocationIds = activeAllocationIds.stream() + .filter(a -> !removingActiveAllocationIds.contains(a)) + .collect(Collectors.toSet()); + final List removingInitializingAllocationIds = randomSubsetOf(initializingIds); + final Set newInitializingAllocationIds = initializingIds.stream() + .filter(a -> !removingInitializingAllocationIds.contains(a)) + .collect(Collectors.toSet()); + routingTable = routingTable(newInitializingAllocationIds, primaryId); + tracker.updateFromClusterManager(initialClusterStateVersion + 1, ids(newActiveAllocationIds), routingTable); + assertTrue(newActiveAllocationIds.stream().allMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).inSync)); + assertTrue(removingActiveAllocationIds.stream().allMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()) == null)); + assertTrue(newInitializingAllocationIds.stream().noneMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).inSync)); + assertTrue(removingInitializingAllocationIds.stream().allMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()) == null)); + assertThat( + tracker.getReplicationGroup().getInSyncAllocationIds(), + equalTo(ids(Sets.difference(Sets.union(activeAllocationIds, newActiveAllocationIds), removingActiveAllocationIds))) + ); + assertThat(tracker.getReplicationGroup().getRoutingTable(), equalTo(routingTable)); + + /* + * Now we will add an allocation ID to each of active and initializing and ensure they propagate through. Using different lengths + * than we have been using above ensures that we can not collide with a previous allocation ID + */ + newInitializingAllocationIds.add(AllocationId.newInitializing()); + tracker.updateFromClusterManager( + initialClusterStateVersion + 2, + ids(newActiveAllocationIds), + routingTable(newInitializingAllocationIds, primaryId) + ); + assertTrue(newActiveAllocationIds.stream().allMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).inSync)); + assertTrue( + newActiveAllocationIds.stream() + .filter(a -> a.equals(primaryId) == false) + .allMatch( + a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).getLocalCheckpoint() == SequenceNumbers.UNASSIGNED_SEQ_NO + ) + ); + assertTrue(newInitializingAllocationIds.stream().noneMatch(a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).inSync)); + assertTrue( + newInitializingAllocationIds.stream() + .allMatch( + a -> tracker.getTrackedLocalCheckpointForShard(a.getId()).getLocalCheckpoint() == SequenceNumbers.UNASSIGNED_SEQ_NO + ) + ); + + // the tracking allocation IDs should play no role in determining the global checkpoint + final Map activeLocalCheckpoints = newActiveAllocationIds.stream() + .collect(Collectors.toMap(Function.identity(), a -> randomIntBetween(1, 1024))); + activeLocalCheckpoints.forEach((a, l) -> updateLocalCheckpoint(tracker, a.getId(), l)); + final Map initializingLocalCheckpoints = newInitializingAllocationIds.stream() + .collect(Collectors.toMap(Function.identity(), a -> randomIntBetween(1, 1024))); + initializingLocalCheckpoints.forEach((a, l) -> updateLocalCheckpoint(tracker, a.getId(), l)); + assertTrue( + activeLocalCheckpoints.entrySet() + .stream() + .allMatch(e -> tracker.getTrackedLocalCheckpointForShard(e.getKey().getId()).getLocalCheckpoint() == e.getValue()) + ); + assertTrue( + initializingLocalCheckpoints.entrySet() + .stream() + .allMatch(e -> tracker.getTrackedLocalCheckpointForShard(e.getKey().getId()).getLocalCheckpoint() == e.getValue()) + ); + final long primaryLocalCheckpoint = activeLocalCheckpoints.get(primaryId); + assertThat(tracker.getGlobalCheckpoint(), equalTo(primaryLocalCheckpoint)); + assertThat(updatedGlobalCheckpoint.get(), equalTo(primaryLocalCheckpoint)); + final long minimumInitailizingLocalCheckpoint = (long) initializingLocalCheckpoints.values().stream().min(Integer::compareTo).get(); + + // now we are going to add a new allocation ID and bring it in sync which should move it to the in-sync allocation IDs + final long localCheckpoint = randomIntBetween( + 0, + Math.toIntExact(Math.min(primaryLocalCheckpoint, minimumInitailizingLocalCheckpoint) - 1) + ); + + // using a different length than we have been using above ensures that we can not collide with a previous allocation ID + final AllocationId newSyncingAllocationId = AllocationId.newInitializing(); + newInitializingAllocationIds.add(newSyncingAllocationId); + tracker.updateFromClusterManager( + initialClusterStateVersion + 3, + ids(newActiveAllocationIds), + routingTable(newInitializingAllocationIds, primaryId) + ); + addPeerRecoveryRetentionLease(tracker, newSyncingAllocationId); + final CyclicBarrier barrier = new CyclicBarrier(2); + final Thread thread = new Thread(() -> { + try { + barrier.await(); + tracker.initiateTracking(newSyncingAllocationId.getId()); + tracker.markAllocationIdAsInSync(newSyncingAllocationId.getId(), localCheckpoint); + barrier.await(); + } catch (final BrokenBarrierException | InterruptedException e) { + throw new RuntimeException(e); + } + }); + + thread.start(); + + barrier.await(); + + assertBusy(() -> { + assertFalse(tracker.pendingInSync.contains(newSyncingAllocationId.getId())); + assertTrue(tracker.getTrackedLocalCheckpointForShard(newSyncingAllocationId.getId()).inSync); + }); + + tracker.updateLocalCheckpoint(newSyncingAllocationId.getId(), randomIntBetween(Math.toIntExact(primaryLocalCheckpoint), 1024)); + + barrier.await(); + + assertFalse(tracker.pendingInSync.contains(newSyncingAllocationId.getId())); + assertTrue(tracker.getTrackedLocalCheckpointForShard(newSyncingAllocationId.getId()).inSync); + + /* + * The new in-sync allocation ID is in the in-sync set now yet the cluster-manager does not know this; the allocation ID should still be in + * the in-sync set even if we receive a cluster state update that does not reflect this. + * + */ + tracker.updateFromClusterManager( + initialClusterStateVersion + 4, + ids(newActiveAllocationIds), + routingTable(newInitializingAllocationIds, primaryId) + ); + assertTrue(tracker.getTrackedLocalCheckpointForShard(newSyncingAllocationId.getId()).inSync); + assertFalse(tracker.pendingInSync.contains(newSyncingAllocationId.getId())); + } + + public void testPrimaryContextHandoffWithRemoteTranslogEnabled() throws IOException { + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings); + final ShardId shardId = new ShardId("test", "_na_", 0); + + FakeClusterState clusterState = initialState(); + final AllocationId aId = clusterState.routingTable.primaryShard().allocationId(); + final LongConsumer onUpdate = updatedGlobalCheckpoint -> {}; + final long primaryTerm = randomNonNegativeLong(); + final long globalCheckpoint = UNASSIGNED_SEQ_NO; + final BiConsumer> onNewRetentionLease = (leases, listener) -> {}; + ReplicationTracker oldPrimary = new ReplicationTracker( + shardId, + aId.getId(), + indexSettings, + primaryTerm, + globalCheckpoint, + onUpdate, + () -> 0L, + onNewRetentionLease, + OPS_BASED_RECOVERY_ALWAYS_REASONABLE + ); + ReplicationTracker newPrimary = new ReplicationTracker( + shardId, + aId.getRelocationId(), + indexSettings, + primaryTerm, + globalCheckpoint, + onUpdate, + () -> 0L, + onNewRetentionLease, + OPS_BASED_RECOVERY_ALWAYS_REASONABLE + ); + + Set allocationIds = new HashSet<>(Arrays.asList(oldPrimary.shardAllocationId, newPrimary.shardAllocationId)); + + clusterState.apply(oldPrimary); + clusterState.apply(newPrimary); + + oldPrimary.activatePrimaryMode(randomIntBetween(Math.toIntExact(NO_OPS_PERFORMED), 10)); + addPeerRecoveryRetentionLease(oldPrimary, newPrimary.shardAllocationId); + newPrimary.updateRetentionLeasesOnReplica(oldPrimary.getRetentionLeases()); + + final int numUpdates = randomInt(10); + for (int i = 0; i < numUpdates; i++) { + if (rarely()) { + clusterState = randomUpdateClusterState(allocationIds, clusterState); + clusterState.apply(oldPrimary); + clusterState.apply(newPrimary); + } + if (randomBoolean()) { + randomLocalCheckpointUpdate(oldPrimary); + } + if (randomBoolean()) { + randomMarkInSync(oldPrimary, newPrimary); + } + } + + // simulate transferring the global checkpoint to the new primary after finalizing recovery before the handoff + markAsTrackingAndInSyncQuietly( + oldPrimary, + newPrimary.shardAllocationId, + Math.max(SequenceNumbers.NO_OPS_PERFORMED, oldPrimary.getGlobalCheckpoint() + randomInt(5)) + ); + oldPrimary.updateGlobalCheckpointForShard(newPrimary.shardAllocationId, oldPrimary.getGlobalCheckpoint()); + ReplicationTracker.PrimaryContext primaryContext = oldPrimary.startRelocationHandoff(newPrimary.shardAllocationId); + + if (randomBoolean()) { + // cluster state update after primary context handoff + if (randomBoolean()) { + clusterState = randomUpdateClusterState(allocationIds, clusterState); + clusterState.apply(oldPrimary); + clusterState.apply(newPrimary); + } + + // abort handoff, check that we can continue updates and retry handoff + oldPrimary.abortRelocationHandoff(); + + if (rarely()) { + clusterState = randomUpdateClusterState(allocationIds, clusterState); + clusterState.apply(oldPrimary); + clusterState.apply(newPrimary); + } + if (randomBoolean()) { + randomLocalCheckpointUpdate(oldPrimary); + } + if (randomBoolean()) { + randomMarkInSync(oldPrimary, newPrimary); + } + + // do another handoff + primaryContext = oldPrimary.startRelocationHandoff(newPrimary.shardAllocationId); + } + + // send primary context through the wire + BytesStreamOutput output = new BytesStreamOutput(); + primaryContext.writeTo(output); + StreamInput streamInput = output.bytes().streamInput(); + primaryContext = new ReplicationTracker.PrimaryContext(streamInput); + switch (randomInt(3)) { + case 0: { + // apply cluster state update on old primary while primary context is being transferred + clusterState = randomUpdateClusterState(allocationIds, clusterState); + clusterState.apply(oldPrimary); + // activate new primary + newPrimary.activateWithPrimaryContext(primaryContext); + // apply cluster state update on new primary so that the states on old and new primary are comparable + clusterState.apply(newPrimary); + break; + } + case 1: { + // apply cluster state update on new primary while primary context is being transferred + clusterState = randomUpdateClusterState(allocationIds, clusterState); + clusterState.apply(newPrimary); + // activate new primary + newPrimary.activateWithPrimaryContext(primaryContext); + // apply cluster state update on old primary so that the states on old and new primary are comparable + clusterState.apply(oldPrimary); + break; + } + case 2: { + // apply cluster state update on both copies while primary context is being transferred + clusterState = randomUpdateClusterState(allocationIds, clusterState); + clusterState.apply(oldPrimary); + clusterState.apply(newPrimary); + newPrimary.activateWithPrimaryContext(primaryContext); + break; + } + case 3: { + // no cluster state update + newPrimary.activateWithPrimaryContext(primaryContext); + break; + } + } + + assertTrue(oldPrimary.primaryMode); + assertTrue(newPrimary.primaryMode); + assertThat(newPrimary.appliedClusterStateVersion, equalTo(oldPrimary.appliedClusterStateVersion)); + /* + * We can not assert on shared knowledge of the global checkpoint between the old primary and the new primary as the new primary + * will update its global checkpoint state without the old primary learning of it, and the old primary could have updated its + * global checkpoint state after the primary context was transferred. + */ + Map oldPrimaryCheckpointsCopy = new HashMap<>(oldPrimary.checkpoints); + oldPrimaryCheckpointsCopy.remove(oldPrimary.shardAllocationId); + oldPrimaryCheckpointsCopy.remove(newPrimary.shardAllocationId); + Map newPrimaryCheckpointsCopy = new HashMap<>(newPrimary.checkpoints); + newPrimaryCheckpointsCopy.remove(oldPrimary.shardAllocationId); + newPrimaryCheckpointsCopy.remove(newPrimary.shardAllocationId); + assertThat(newPrimaryCheckpointsCopy, equalTo(oldPrimaryCheckpointsCopy)); + // we can however assert that shared knowledge of the local checkpoint and in-sync status is equal + assertThat( + oldPrimary.checkpoints.get(oldPrimary.shardAllocationId).localCheckpoint, + equalTo(newPrimary.checkpoints.get(oldPrimary.shardAllocationId).localCheckpoint) + ); + assertThat( + oldPrimary.checkpoints.get(newPrimary.shardAllocationId).localCheckpoint, + equalTo(newPrimary.checkpoints.get(newPrimary.shardAllocationId).localCheckpoint) + ); + assertThat( + oldPrimary.checkpoints.get(oldPrimary.shardAllocationId).inSync, + equalTo(newPrimary.checkpoints.get(oldPrimary.shardAllocationId).inSync) + ); + assertThat( + oldPrimary.checkpoints.get(newPrimary.shardAllocationId).inSync, + equalTo(newPrimary.checkpoints.get(newPrimary.shardAllocationId).inSync) + ); + assertThat(newPrimary.getGlobalCheckpoint(), equalTo(oldPrimary.getGlobalCheckpoint())); + assertThat(newPrimary.routingTable, equalTo(oldPrimary.routingTable)); + assertThat(newPrimary.replicationGroup, equalTo(oldPrimary.replicationGroup)); + + assertFalse(oldPrimary.relocated); + oldPrimary.completeRelocationHandoff(); + assertFalse(oldPrimary.primaryMode); + assertTrue(oldPrimary.relocated); + } + + public void testIllegalStateExceptionIfUnknownAllocationIdWithRemoteTranslogEnabled() { + final AllocationId active = AllocationId.newInitializing(); + final AllocationId initializing = AllocationId.newInitializing(); + Settings settings = Settings.builder().put("index.remote_store.translog.enabled", "true").build(); + final ReplicationTracker tracker = newTracker(active, settings); + tracker.updateFromClusterManager( + randomNonNegativeLong(), + Collections.singleton(active.getId()), + routingTable(Collections.singleton(initializing), active) + ); + tracker.activatePrimaryMode(NO_OPS_PERFORMED); + + expectThrows(IllegalStateException.class, () -> tracker.initiateTracking(randomAlphaOfLength(10))); + expectThrows(IllegalStateException.class, () -> tracker.markAllocationIdAsInSync(randomAlphaOfLength(10), randomNonNegativeLong())); + } + } diff --git a/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java b/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java index b3f062aef4fbe..92c80ac1799ef 100644 --- a/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java @@ -52,6 +52,7 @@ import org.opensearch.action.support.ActionTestUtils; import org.opensearch.action.support.PlainActionFuture; import org.opensearch.action.support.WriteRequest; +import org.opensearch.action.support.replication.FanoutReplicationProxy; import org.opensearch.action.support.replication.PendingReplicationActions; import org.opensearch.action.support.replication.ReplicatedWriteRequest; import org.opensearch.action.support.replication.ReplicationOperation; @@ -727,7 +728,8 @@ public void execute() { opType, primaryTerm, TimeValue.timeValueMillis(20), - TimeValue.timeValueSeconds(60) + TimeValue.timeValueSeconds(60), + new FanoutReplicationProxy<>() ).execute(); } catch (Exception e) { listener.onFailure(e);