From 3b16aae947e30e5fd8346acb99b6b871c005a00c Mon Sep 17 00:00:00 2001 From: Max Neverov Date: Mon, 15 Apr 2024 15:56:59 +0200 Subject: [PATCH 1/4] Fix remove member failed. Signed-off-by: Max Neverov --- server/etcdserver/server.go | 9 ++++----- server/etcdserver/util.go | 2 +- tests/common/member_test.go | 3 ++- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index 541cf797c64..2f88bdec30e 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -86,7 +86,7 @@ const ( StoreKeysPrefix = "/1" // HealthInterval is the minimum time the cluster should be healthy - // before accepting add member requests. + // before accepting add and delete member requests. HealthInterval = 5 * time.Second purgeFileInterval = 30 * time.Second @@ -1592,14 +1592,13 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error { } // protect quorum if some members are down - m := s.cluster.VotingMembers() - active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.MemberID(), m) - if (active - 1) < 1+((len(m)-1)/2) { + since := time.Now().Add(-HealthInterval) + if !isConnectedToQuorumSince(s.r.transport, since, s.MemberID(), s.cluster.Members()) { lg.Warn( "rejecting member remove request; local member has not been connected to all peers, reconfigure breaks active quorum", zap.String("local-member-id", s.MemberID().String()), zap.String("requested-member-remove", id.String()), - zap.Int("active-peers", active), + zap.Int("active-peers", numConnectedSince(s.r.transport, since, s.MemberID(), s.cluster.Members())), zap.Error(errors.ErrUnhealthy), ) return errors.ErrUnhealthy diff --git a/server/etcdserver/util.go b/server/etcdserver/util.go index fbba5491b07..67ee3facc92 100644 --- a/server/etcdserver/util.go +++ b/server/etcdserver/util.go @@ -33,7 +33,7 @@ func isConnectedToQuorumSince(transport rafthttp.Transporter, since time.Time, s // remote member since the given time. func isConnectedSince(transport rafthttp.Transporter, since time.Time, remote types.ID) bool { t := transport.ActiveSince(remote) - return !t.IsZero() && t.Before(since) + return !t.IsZero() && !t.After(since) } // isConnectedFullySince checks whether the local member is connected to all diff --git a/tests/common/member_test.go b/tests/common/member_test.go index 1f2687c1371..a6cb0bd161a 100644 --- a/tests/common/member_test.go +++ b/tests/common/member_test.go @@ -207,7 +207,8 @@ func TestMemberRemove(t *testing.T) { testutils.ExecuteUntil(ctx, t, func() { if quorumTc.waitForQuorum { - time.Sleep(etcdserver.HealthInterval) + // wait for health interval + leader election + time.Sleep(etcdserver.HealthInterval + 2*time.Second) } memberID, clusterID := memberToRemove(ctx, t, cc, c.ClusterSize) From c64c996c03575c1c571430d51699690c88015bf8 Mon Sep 17 00:00:00 2001 From: Max Neverov Date: Tue, 16 Apr 2024 08:28:43 +0200 Subject: [PATCH 2/4] Revert quorum calculation: `(active - 1) < 1+((len(m)-1)/2)` calculates quorum after a member is deleted. Signed-off-by: Max Neverov --- server/etcdserver/server.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index 2f88bdec30e..2eebb421de1 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -1592,13 +1592,14 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error { } // protect quorum if some members are down - since := time.Now().Add(-HealthInterval) - if !isConnectedToQuorumSince(s.r.transport, since, s.MemberID(), s.cluster.Members()) { + m := s.cluster.VotingMembers() + active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.MemberID(), m) + if (active - 1) < 1+((len(m)-1)/2) { lg.Warn( "rejecting member remove request; local member has not been connected to all peers, reconfigure breaks active quorum", zap.String("local-member-id", s.MemberID().String()), zap.String("requested-member-remove", id.String()), - zap.Int("active-peers", numConnectedSince(s.r.transport, since, s.MemberID(), s.cluster.Members())), + zap.Int("active-peers", active), zap.Error(errors.ErrUnhealthy), ) return errors.ErrUnhealthy From c2982e15f36e97879e523714ec3d1fcec12063f3 Mon Sep 17 00:00:00 2001 From: Max Neverov Date: Thu, 18 Apr 2024 11:26:28 +0200 Subject: [PATCH 3/4] Revert checking connected since inclusively. Signed-off-by: Max Neverov --- server/etcdserver/util.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/etcdserver/util.go b/server/etcdserver/util.go index 67ee3facc92..fbba5491b07 100644 --- a/server/etcdserver/util.go +++ b/server/etcdserver/util.go @@ -33,7 +33,7 @@ func isConnectedToQuorumSince(transport rafthttp.Transporter, since time.Time, s // remote member since the given time. func isConnectedSince(transport rafthttp.Transporter, since time.Time, remote types.ID) bool { t := transport.ActiveSince(remote) - return !t.IsZero() && !t.After(since) + return !t.IsZero() && t.Before(since) } // isConnectedFullySince checks whether the local member is connected to all From 6b0b4ce2b306ebec91a8aac99b7ab36720740e07 Mon Sep 17 00:00:00 2001 From: Max Neverov Date: Thu, 18 Apr 2024 14:47:11 +0200 Subject: [PATCH 4/4] Increase test timeout (twice of sleep interval). Signed-off-by: Max Neverov --- tests/common/member_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common/member_test.go b/tests/common/member_test.go index a6cb0bd161a..2d1039faba3 100644 --- a/tests/common/member_test.go +++ b/tests/common/member_test.go @@ -196,7 +196,7 @@ func TestMemberRemove(t *testing.T) { continue } t.Run(quorumTc.name+"/"+clusterTc.name, func(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 14*time.Second) defer cancel() c := clusterTc.config c.StrictReconfigCheck = quorumTc.strictReconfigCheck