Commit fda61d3

fix: determine F3 participants relative to current network name (#12597)
* Investigate intermittent F3 itest failures on CI

  Repeat F3 itests on CI to investigate intermittent failures.

* Fix participation lease removal for wrong network

  When the manifest changes, depending on the timing it is possible for newly generated valid leases to get removed if the sign message loop attempts to sign messages that result from progressing the previous network. Here is an example scenario, in a specific order, that was causing itests to fail:

  * participants get a lease for network A up to instance 5
  * network A progresses to instance 6
  * the manifest changes the network name to B
  * participants get a new lease for network B up to instance 5
  * the sign loop receives a message from network A, instance 6
  * `getParticipantsByInstance` lazily removes the new leases, since it only checks the instance
  * the node ends up with no participants, and gets stuck

  To fix this: 1) check whether the participants asked for are within the current network, and if not refuse to participate; 2) check the network name, as well as the instance, when lazily removing expired leases.

* Add debug capability to F3 itests to print current progress

  To aid debugging of failing tests, add an option to print the progress of all nodes at every eventual assertion, disabled by default.

* Shorten GPBFT settings for a more responsive timing

  Defaults are based on an epoch of 30s and real RTT. Shorten Delta and rebroadcast times.

* Remove F3 itest repetitions on CI now that saul goodman

  See proof of the pudding:

  * https://github.com/filecoin-project/lotus/actions/runs/11369403828/job/31626763159?pr=12597

* Update the changelog

* Address review comments

* Remove the sanity check that all nodes use the same initial manifest
1 parent dafc56c commit fda61d3
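To make the two checks easier to follow before reading the diffs, here is a minimal, self-contained Go sketch of the behaviour the fix introduces. The lease and leaser types below are simplified stand-ins for the ones in chain/lf3/participation_lease.go, not the actual Lotus types.

package main

import "fmt"

// Simplified stand-ins for the lease bookkeeping in participation_lease.go.
type lease struct {
	network    string
	toInstance uint64
}

type leaser struct {
	currentNetwork string // network name from the current manifest
	leases         map[uint64]lease
}

// getParticipantsByInstance mirrors the post-fix behaviour:
//  1. refuse to participate if the request is for a different network, and
//  2. lazily drop leases whose network or instance no longer matches.
func (l *leaser) getParticipantsByInstance(network string, instance uint64) []uint64 {
	if l.currentNetwork != network {
		return nil // message from a previous network; leave the leases alone
	}
	var participants []uint64
	for id, ls := range l.leases {
		switch {
		case ls.network != l.currentNetwork:
			delete(l.leases, id) // lease acquired under a prior manifest
		case instance > ls.toInstance:
			delete(l.leases, id) // lease genuinely expired
		default:
			participants = append(participants, id)
		}
	}
	return participants
}

func main() {
	l := &leaser{
		currentNetwork: "B",
		leases:         map[uint64]lease{123: {network: "B", toInstance: 5}},
	}
	// A late message from network A, instance 6, no longer wipes the new lease.
	fmt.Println(l.getParticipantsByInstance("A", 6)) // []
	fmt.Println(l.getParticipantsByInstance("B", 3)) // [123]
}

The real implementation additionally reads the current network name from the manifest reported by the leaser's status function, as the diff to chain/lf3/participation_lease.go below shows.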

File tree

5 files changed: +75 −14

  CHANGELOG.md
  chain/lf3/f3.go
  chain/lf3/participation_lease.go
  chain/lf3/participation_lease_test.go
  itests/f3_test.go

CHANGELOG.md (+1)

@@ -14,6 +14,7 @@
 - Fix a bug in the `lotus-shed indexes backfill-events` command that may result in either duplicate events being backfilled where there are existing events (such an operation *should* be idempotent) or events erroneously having duplicate `logIndex` values when queried via ETH APIs. ([filecoin-project/lotus#12567](https://github.com/filecoin-project/lotus/pull/12567))
 - Event APIs (Eth events and actor events) should only return reverted events if client queries by specific block hash / tipset. Eth and actor event subscription APIs should always return reverted events to enable accurate observation of real-time changes. ([filecoin-project/lotus#12585](https://github.com/filecoin-project/lotus/pull/12585))
 - Add logic to check if the miner's owner address is delegated (f4 address). If it is delegated, the `lotus-shed sectors termination-estimate` command now sends the termination state call using the worker ID. This fix resolves the issue where termination-estimate did not function correctly for miners with delegated owner addresses. ([filecoin-project/lotus#12569](https://github.com/filecoin-project/lotus/pull/12569))
+- Fix a bug in F3 participation API where valid leases may get removed due to dynamic manifest update. ([filecoin-project/lotus#12597](https://github.com/filecoin-project/lotus/pull/12597))

 ## Deps

chain/lf3/f3.go (+1 −1)

@@ -155,7 +155,7 @@ func (fff *F3) runSigningLoop(ctx context.Context) {
 				clear(alreadyParticipated)
 			}

-			participants := fff.leaser.getParticipantsByInstance(mb.Payload.Instance)
+			participants := fff.leaser.getParticipantsByInstance(mb.NetworkName, mb.Payload.Instance)
 			for _, id := range participants {
 				if _, ok := alreadyParticipated[id]; ok {
 					continue

chain/lf3/participation_lease.go (+13 −3)

@@ -112,15 +112,25 @@ func (l *leaser) participate(ticket api.F3ParticipationTicket) (api.F3ParticipationLease, error) {
 	return newLease, nil
 }

-func (l *leaser) getParticipantsByInstance(instance uint64) []uint64 {
+func (l *leaser) getParticipantsByInstance(network gpbft.NetworkName, instance uint64) []uint64 {
 	l.mutex.Lock()
 	defer l.mutex.Unlock()
+	currentManifest, _ := l.status()
+	currentNetwork := currentManifest.NetworkName
+	if currentNetwork != network {
+		return nil
+	}
 	var participants []uint64
 	for id, lease := range l.leases {
-		if instance > lease.ToInstance() {
+		if currentNetwork != lease.Network {
+			// Lazily delete any lease that does not belong to network, likely acquired from
+			// prior manifests.
+			delete(l.leases, id)
+			log.Warnf("lost F3 participation lease for miner %d at instance %d due to network mismatch: %s != %s", id, instance, currentNetwork, lease.Network)
+		} else if instance > lease.ToInstance() {
 			// Lazily delete the expired leases.
 			delete(l.leases, id)
-			log.Warnf("lost F3 participation lease for miner %d", id)
+			log.Warnf("lost F3 participation lease for miner %d due to instance (%d) > lease to instance (%d)", id, instance, lease.ToInstance())
 		} else {
 			participants = append(participants, id)
 		}

chain/lf3/participation_lease_test.go (+3 −3)

@@ -42,18 +42,18 @@ func TestLeaser(t *testing.T) {
 		require.NoError(t, err)

 		// Both participants should still be valid.
-		participants := subject.getParticipantsByInstance(11)
+		participants := subject.getParticipantsByInstance(testManifest.NetworkName, 11)
 		require.Len(t, participants, 2)
 		require.Contains(t, participants, uint64(123))
 		require.Contains(t, participants, uint64(456))

 		// After instance 16, only participant 456 should be valid.
-		participants = subject.getParticipantsByInstance(16)
+		participants = subject.getParticipantsByInstance(testManifest.NetworkName, 16)
 		require.Len(t, participants, 1)
 		require.Contains(t, participants, uint64(456))

 		// After instance 17, no participant must have a lease.
-		participants = subject.getParticipantsByInstance(17)
+		participants = subject.getParticipantsByInstance(testManifest.NetworkName, 17)
 		require.Empty(t, participants)
 	})
 	t.Run("expired ticket", func(t *testing.T) {

itests/f3_test.go (+57 −7)

@@ -2,6 +2,7 @@ package itests

 import (
 	"context"
+	"sync"
 	"testing"
 	"time"

@@ -36,6 +37,7 @@ type testEnv struct {
 	m              *manifest.Manifest
 	t              *testing.T
 	testCtx        context.Context
+	debug          bool
 }

 // Test that checks that F3 is enabled successfully,
@@ -194,6 +196,24 @@ func (e *testEnv) waitFor(f func(n *kit.TestFullNode) bool, timeout time.Duration) {
 	e.t.Helper()
 	require.Eventually(e.t, func() bool {
 		e.t.Helper()
+		defer func() {
+			if e.debug {
+				var wg sync.WaitGroup
+				printProgress := func(index int, n *kit.TestFullNode) {
+					defer wg.Done()
+					if progress, err := n.F3GetProgress(e.testCtx); err != nil {
+						e.t.Logf("Node #%d progress: err: %v", index, err)
+					} else {
+						e.t.Logf("Node #%d progress: %v", index, progress)
+					}
+				}
+				for i, n := range e.minerFullNodes {
+					wg.Add(1)
+					go printProgress(i, n)
+				}
+				wg.Wait()
+			}
+		}()
 		for _, n := range e.minerFullNodes {
 			if !f(n) {
 				return false
@@ -209,8 +229,42 @@ func (e *testEnv) waitFor(f func(n *kit.TestFullNode) bool, timeout time.Duration) {
 // and the second full-node is an observer that is not directly connected to
 // a miner. The last return value is the manifest sender for the network.
 func setup(t *testing.T, blocktime time.Duration) *testEnv {
-	manif := lf3.NewManifest(BaseNetworkName+"/1", DefaultFinality, DefaultBootstrapEpoch, blocktime, cid.Undef)
-	return setupWithStaticManifest(t, manif, false)
+	return setupWithStaticManifest(t, newTestManifest(blocktime), false)
+}
+
+func newTestManifest(blocktime time.Duration) *manifest.Manifest {
+	return &manifest.Manifest{
+		ProtocolVersion:   manifest.VersionCapability,
+		BootstrapEpoch:    DefaultBootstrapEpoch,
+		NetworkName:       BaseNetworkName + "/1",
+		InitialPowerTable: cid.Undef,
+		CommitteeLookback: manifest.DefaultCommitteeLookback,
+		CatchUpAlignment:  blocktime / 2,
+		Gpbft: manifest.GpbftConfig{
+			// Use smaller time intervals for more responsive test progress/assertion.
+			Delta:                      250 * time.Millisecond,
+			DeltaBackOffExponent:       1.3,
+			MaxLookaheadRounds:         5,
+			RebroadcastBackoffBase:     500 * time.Millisecond,
+			RebroadcastBackoffSpread:   0.1,
+			RebroadcastBackoffExponent: 1.3,
+			RebroadcastBackoffMax:      1 * time.Second,
+		},
+		EC: manifest.EcConfig{
+			Period:                   blocktime,
+			Finality:                 DefaultFinality,
+			DelayMultiplier:          manifest.DefaultEcConfig.DelayMultiplier,
+			BaseDecisionBackoffTable: manifest.DefaultEcConfig.BaseDecisionBackoffTable,
+			HeadLookback:             0,
+			Finalize:                 true,
+		},
+		CertificateExchange: manifest.CxConfig{
+			ClientRequestTimeout: manifest.DefaultCxConfig.ClientRequestTimeout,
+			ServerRequestTimeout: manifest.DefaultCxConfig.ServerRequestTimeout,
+			MinimumPollInterval:  blocktime,
+			MaximumPollInterval:  4 * blocktime,
+		},
+	}
 }

 func setupWithStaticManifest(t *testing.T, manif *manifest.Manifest, testBootstrap bool) *testEnv {
@@ -262,10 +316,7 @@ func setupWithStaticManifest(t *testing.T, manif *manifest.Manifest, testBootstrap bool) *testEnv {
 		cancel()
 	}

-	m, err := n1.F3GetManifest(ctx)
-	require.NoError(t, err)
-
-	e := &testEnv{m: m, t: t, testCtx: ctx}
+	e := &testEnv{m: manif, t: t, testCtx: ctx}
 	// in case we want to use more full-nodes in the future
 	e.minerFullNodes = []*kit.TestFullNode{&n1, &n2, &n3}

@@ -275,7 +326,6 @@ func setupWithStaticManifest(t *testing.T, manif *manifest.Manifest, testBootstrap bool) *testEnv {
 		err = n.NetConnect(ctx, e.ms.PeerInfo())
 		require.NoError(t, err)
 	}
-
 	errgrp.Go(func() error {
 		defer func() {
 			require.NoError(t, manifestServerHost.Close())
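For completeness, a hypothetical sketch (not part of this commit) of how an itest could turn on the new debug output; it assumes it lives in the same itests package with the existing imports, setup, the debug field, and waitFor are the helpers changed above, and the F3GetProgress check is just a placeholder predicate:

func TestF3DebugProgressSketch(t *testing.T) {
	e := setup(t, 100*time.Millisecond)
	e.debug = true // waitFor now logs every node's F3 progress on each poll
	e.waitFor(func(n *kit.TestFullNode) bool {
		_, err := n.F3GetProgress(e.testCtx)
		return err == nil
	}, 30*time.Second)
}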
