Skip to content

Commit 852a118

Browse files
authored
Move updating connectivity state outside of subchannel lock (#2601)
1 parent 60d9d2c commit 852a118

File tree

2 files changed

+96
-2
lines changed

2 files changed

+96
-2
lines changed

src/Grpc.Net.Client/Balancer/Subchannel.cs

+11-2
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ public void UpdateAddresses(IReadOnlyList<BalancerAddress> addresses)
237237
/// </summary>
238238
public void RequestConnection()
239239
{
240+
var connectionRequested = false;
240241
lock (Lock)
241242
{
242243
switch (_state)
@@ -245,7 +246,8 @@ public void RequestConnection()
245246
SubchannelLog.ConnectionRequested(_logger, Id);
246247

247248
// Only start connecting underlying transport if in an idle state.
248-
UpdateConnectivityState(ConnectivityState.Connecting, "Connection requested.");
249+
// Update connectivity state outside of subchannel lock to avoid deadlock.
250+
connectionRequested = true;
249251
break;
250252
case ConnectivityState.Connecting:
251253
case ConnectivityState.Ready:
@@ -264,6 +266,11 @@ public void RequestConnection()
264266
}
265267
}
266268

269+
if (connectionRequested)
270+
{
271+
UpdateConnectivityState(ConnectivityState.Connecting, "Connection requested.");
272+
}
273+
267274
// Don't capture the current ExecutionContext and its AsyncLocals onto the connect
268275
var restoreFlow = false;
269276
if (!ExecutionContext.IsFlowSuppressed())
@@ -448,6 +455,8 @@ internal bool UpdateConnectivityState(ConnectivityState state, string successDet
448455

449456
internal bool UpdateConnectivityState(ConnectivityState state, Status status)
450457
{
458+
Debug.Assert(!Monitor.IsEntered(Lock), "Ensure the subchannel lock isn't held here. Updating channel state with the subchannel lock can cause a deadlock.");
459+
451460
lock (Lock)
452461
{
453462
// Don't update subchannel state if the state is the same or the subchannel has been shutdown.
@@ -462,7 +471,7 @@ internal bool UpdateConnectivityState(ConnectivityState state, Status status)
462471
}
463472
_state = state;
464473
}
465-
474+
466475
// Notify channel outside of lock to avoid deadlocks.
467476
_manager.OnSubchannelStateChange(this, state, status);
468477
return true;

test/Grpc.Net.Client.Tests/Balancer/ConnectionManagerTests.cs

+85
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#endregion
1818

1919
#if SUPPORT_LOAD_BALANCING
20+
using System.Diagnostics;
2021
using System.Net;
2122
using System.Threading.Channels;
2223
using Greet;
@@ -29,6 +30,7 @@
2930
using Grpc.Tests.Shared;
3031
using Microsoft.Extensions.DependencyInjection;
3132
using Microsoft.Extensions.Logging;
33+
using Microsoft.Extensions.Logging.Abstractions;
3234
using Microsoft.Extensions.Logging.Testing;
3335
using NUnit.Framework;
3436
using ChannelState = Grpc.Net.Client.Balancer.ChannelState;
@@ -535,6 +537,89 @@ public async Task PickAsync_DoesNotDeadlockAfterReconnect_WithZeroAddressResolve
535537
await pickTask.DefaultTimeout();
536538
}
537539

540+
[Test]
541+
public async Task PickAsync_UpdateAddressesWhileRequestingConnection_DoesNotDeadlock()
542+
{
543+
var services = new ServiceCollection();
544+
services.AddNUnitLogger();
545+
546+
var testSink = new TestSink();
547+
var testProvider = new TestLoggerProvider(testSink);
548+
549+
services.AddLogging(b =>
550+
{
551+
b.AddProvider(testProvider);
552+
});
553+
554+
await using var serviceProvider = services.BuildServiceProvider();
555+
var loggerFactory = serviceProvider.GetRequiredService<ILoggerFactory>();
556+
557+
var resolver = new TestResolver(loggerFactory);
558+
resolver.UpdateAddresses(new List<BalancerAddress>
559+
{
560+
new BalancerAddress("localhost", 80)
561+
});
562+
563+
var channelOptions = new GrpcChannelOptions();
564+
565+
var transportFactory = new TestSubchannelTransportFactory();
566+
var clientChannel = CreateConnectionManager(loggerFactory, resolver, transportFactory, new[] { new PickFirstBalancerFactory() });
567+
// Configure balancer similar to how GrpcChannel constructor does it
568+
clientChannel.ConfigureBalancer(c => new ChildHandlerLoadBalancer(
569+
c,
570+
channelOptions.ServiceConfig,
571+
clientChannel));
572+
573+
await clientChannel.ConnectAsync(waitForReady: true, cancellationToken: CancellationToken.None);
574+
575+
transportFactory.Transports.ForEach(t => t.Disconnect());
576+
577+
var requestConnectionSyncPoint = new SyncPoint(runContinuationsAsynchronously: true);
578+
testSink.MessageLogged += (w) =>
579+
{
580+
if (w.EventId.Name == "ConnectionRequested")
581+
{
582+
requestConnectionSyncPoint.WaitToContinue().Wait();
583+
}
584+
};
585+
586+
// Task should pause when requesting connection because of the logger sink.
587+
var pickTask = Task.Run(() => clientChannel.PickAsync(
588+
new PickContext { Request = new HttpRequestMessage() },
589+
waitForReady: true,
590+
CancellationToken.None).AsTask());
591+
592+
// Wait until we're paused on requesting a connection.
593+
await requestConnectionSyncPoint.WaitForSyncPoint().DefaultTimeout();
594+
595+
// Update addresses while requesting a connection.
596+
var updateAddressesTcs = new TaskCompletionSource<object?>(TaskCreationOptions.RunContinuationsAsynchronously);
597+
var updateAddressesTask = Task.Run(() =>
598+
{
599+
updateAddressesTcs.TrySetResult(null);
600+
resolver.UpdateAddresses(new List<BalancerAddress>
601+
{
602+
new BalancerAddress("localhost", 81)
603+
});
604+
});
605+
606+
// There isn't a clean way to wait for UpdateAddresses to be waiting for the subchannel lock.
607+
// Use a long delay to ensure we're waiting for the lock and are in the right state.
608+
await updateAddressesTcs.Task.DefaultTimeout();
609+
await Task.Delay(500);
610+
requestConnectionSyncPoint.Continue();
611+
612+
// Ensure the pick completes without deadlock.
613+
try
614+
{
615+
await pickTask.DefaultTimeout();
616+
}
617+
catch (TimeoutException ex)
618+
{
619+
throw new InvalidOperationException("Likely deadlock when picking subchannel.", ex);
620+
}
621+
}
622+
538623
[Test]
539624
public async Task PickAsync_ExecutionContext_DoesNotCaptureAsyncLocalsInConnect()
540625
{

0 commit comments

Comments
 (0)