
Commit eaac059

Fix - Under load and during topology changes, thread saturation can occur, causing a lockup (#2139)
* Add endpoint manager test to repro thread lockup fix merge
* add explanation and dockerfile
* Block endpoint while it disposes instead of holding requests behind a lock. this also allows messages to other endpoints while one is disposing, as well as multiple endpoint disposes at the same time.
* Change EndpointManager Stop to StopAsync
* ensure the EndpointReader always finishes up the request after sending the DisconnectRequest, so it doesn't time out during kestrel shutdown
* increase timeout on a couple tests so they fail less often
* increase another timeout on flakey test
1 parent 4bbf51b commit eaac059
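
The heart of the change, per the bullets above, is to stop disposing endpoints while holding the EndpointManager lock. As a rough, self-contained sketch of that pattern (the type and member names below are simplified stand-ins, not the actual Proto.Remote API): mark the address as blocked inside the lock, leave the lock, then dispose, and finally unblock.

using System;
using System.Collections.Concurrent;
using System.Threading.Tasks;

// Hypothetical stand-in for Proto.Remote's endpoint abstraction.
public interface IEndpointSketch : IAsyncDisposable { }

public sealed class EndpointRegistrySketch
{
    private readonly object _synLock = new();
    private readonly ConcurrentDictionary<string, IEndpointSketch> _serverEndpoints = new();
    private readonly ConcurrentDictionary<string, DateTime> _blockedAddresses = new();

    public async Task OnEndpointTerminatedAsync(string address)
    {
        IEndpointSketch? endpoint;

        lock (_synLock)
        {
            if (!_serverEndpoints.TryRemove(address, out endpoint))
            {
                return;
            }

            // Block the address first, so concurrent senders get a no-op "blocked"
            // endpoint instead of queuing up behind this lock.
            _blockedAddresses.TryAdd(address, DateTime.UtcNow);
        }

        // The old code called endpoint.DisposeAsync().GetAwaiter().GetResult() here,
        // inside the lock; under thread-pool saturation that can stall every sender.
        await endpoint.DisposeAsync().ConfigureAwait(false);

        _blockedAddresses.TryRemove(address, out _);
    }
}

Lookups in the same sketch would consult _blockedAddresses (and the stop token) before touching the dictionary and hand back a no-op blocked endpoint, which is what the EndpointManager.cs hunks below do.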

File tree

11 files changed: +269 additions, -76 deletions


ProtoActor.sln

Lines changed: 15 additions & 0 deletions
@@ -307,6 +307,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "template", "template", "{08
 		examples\ClusterK8sGrains\chart\templates\protoactor-k8s-grains-serviceaccount.yaml = examples\ClusterK8sGrains\chart\templates\protoactor-k8s-grains-serviceaccount.yaml
 	EndProjectSection
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EndpointManagerTest", "benchmarks\EndpointManagerTest\EndpointManagerTest.csproj", "{B7258689-41D2-4284-AF93-050DD1DFEAC4}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -1481,6 +1483,18 @@ Global
 		{B196FBFE-0DAA-4533-9A56-BB5826A57923}.Release|x64.Build.0 = Release|Any CPU
 		{B196FBFE-0DAA-4533-9A56-BB5826A57923}.Release|x86.ActiveCfg = Release|Any CPU
 		{B196FBFE-0DAA-4533-9A56-BB5826A57923}.Release|x86.Build.0 = Release|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Debug|x64.Build.0 = Debug|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Debug|x86.Build.0 = Debug|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Release|Any CPU.Build.0 = Release|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Release|x64.ActiveCfg = Release|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Release|x64.Build.0 = Release|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Release|x86.ActiveCfg = Release|Any CPU
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4}.Release|x86.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -1616,6 +1630,7 @@ Global
 		{B196FBFE-0DAA-4533-9A56-BB5826A57923} = {ADE7A14E-FFE9-4137-AC25-E2F2A82B0A8C}
 		{CDCE3D4C-1BDD-460F-93B8-75123A258183} = {ADE7A14E-FFE9-4137-AC25-E2F2A82B0A8C}
 		{087E5441-1582-4D55-8233-014C0FB06FF0} = {CDCE3D4C-1BDD-460F-93B8-75123A258183}
+		{B7258689-41D2-4284-AF93-050DD1DFEAC4} = {0F3AB331-C042-4371-A2F0-0AFDFA13DC9F}
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {CD0D1E44-8118-4682-8793-6B20ABFA824C}

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base
+USER $APP_UID
+WORKDIR /app
+
+FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
+ARG BUILD_CONFIGURATION=Release
+WORKDIR /src
+COPY ["benchmarks/EndpointManagerTest/EndpointManagerTest.csproj", "benchmarks/EndpointManagerTest/"]
+COPY ["src/Proto.Actor/Proto.Actor.csproj", "src/Proto.Actor/"]
+COPY ["src/Proto.Remote/Proto.Remote.csproj", "src/Proto.Remote/"]
+RUN dotnet restore "benchmarks/EndpointManagerTest/EndpointManagerTest.csproj"
+COPY . .
+WORKDIR "/src/benchmarks/EndpointManagerTest"
+RUN dotnet build "EndpointManagerTest.csproj" -c $BUILD_CONFIGURATION -o /app/build
+
+FROM build AS publish
+ARG BUILD_CONFIGURATION=Release
+RUN dotnet publish "EndpointManagerTest.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
+
+FROM base AS final
+WORKDIR /app
+COPY --from=publish /app/publish .
+ENTRYPOINT ["dotnet", "EndpointManagerTest.dll"]
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+    <PropertyGroup>
+        <OutputType>Exe</OutputType>
+        <TargetFramework>net8.0</TargetFramework>
+        <ImplicitUsings>enable</ImplicitUsings>
+        <Nullable>enable</Nullable>
+        <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
+    </PropertyGroup>
+
+    <ItemGroup>
+        <ProjectReference Include="..\..\src\Proto.Actor\Proto.Actor.csproj" />
+        <ProjectReference Include="..\..\src\Proto.Remote\Proto.Remote.csproj" />
+    </ItemGroup>
+
+    <ItemGroup>
+        <Content Include="..\..\.dockerignore">
+            <Link>.dockerignore</Link>
+        </Content>
+    </ItemGroup>
+
+</Project>

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+using System.Diagnostics;
+using Microsoft.Extensions.Logging;
+using Proto;
+using Proto.Remote;
+using Proto.Remote.GrpcNet;
+
+namespace EndpointManagerTest;
+
+// This program tests lockup issues with the EndpointManager.
+// TL:DR; This is to demonstrate the issue with the locking and blocking waits in EndpointManager, and to confirm the fix.
+//
+// This recreates a scenario we were seeing in our production environments.
+// What we saw was 30 cluster clients were sending many messages to 2 of the cluster members, who were sending messages to eachother depending
+// on actor placement. If something happens and the 2 members had to reboot, they would end up locking up, not being able to do anything.
+// This scenario has been recreated more simply here, where you have 2 members sending many messages back and forth, a disconnect comes through
+// from a member that recently restarted, and new connections are being opened to other members. Putting all these together, we end up in a situation
+// where many threads get stuck at a lock in EndpointManager, while the one thread inside of the lock is waiting for a ServerConnector to stop.
+// NOTE: that this can be a bit flakey as we are trying to reproduce a complete thread lockup. So there is a dockerfile to run it in a more consistent
+// environment. Using `--cpus="1"` with docker will make it even more consistent, but sometimes it takes a few tries to repro.
+// You will know you reproduced it when you stop seeing "This should log every second." every second. you may also see the built in
+// "ThreadPool is running hot" log, but the absence of that log is ambiguous, since if it's locked up it won't finish to log how long it took!
+// The other indicator is that all the new connections made at the end should be logging terminations and reconnects and quickly give up (since they don't exist),
+// but of course that won't be happening when you're locked up. Also seeing any "terminating" messages without a corresponding "terminated" message
+// also indicates that you're locked up.
+class Program
+{
+    private static async Task Main()
+    {
+        Log.SetLoggerFactory(
+            LoggerFactory.Create(
+                c =>
+                    c.SetMinimumLevel(LogLevel.Debug)
+                        .AddFilter("Microsoft", LogLevel.None)
+                        .AddFilter("Grpc", LogLevel.None)
+                        .AddFilter("Proto.Context.ActorContext", LogLevel.Information)
+                        .AddFilter("Proto.Diagnostics.DiagnosticsStore", LogLevel.Warning)
+                        .AddFilter("Proto.Remote.ServerConnector", LogLevel.Error)
+                        .AddSimpleConsole(o => o.SingleLine = true)
+            )
+        );
+
+        var logger = Log.CreateLogger("Main");
+
+        _ = Task.Factory.StartNew(async () =>
+        {
+            while (true)
+            {
+                try
+                {
+                    await Task.Factory.StartNew(async () => { await Task.Yield(); });
+                }
+                catch (Exception)
+                {
+                }
+
+                logger.LogInformation("This should log every second [pending: {pendingWorkItems}].", ThreadPool.PendingWorkItemCount);
+                await Task.Delay(1000);
+            }
+        });
+
+        var sys1 = new ActorSystem().WithRemote(GrpcNetRemoteConfig.BindTo("localhost", 12000).WithRemoteKind("noop", Props.FromProducer(() => new NoopActor())));
+        await sys1.Remote().StartAsync();
+
+        var sys2 = new ActorSystem().WithRemote(GrpcNetRemoteConfig.BindTo("localhost", 12001).WithRemoteKind("noop", Props.FromProducer(() => new NoopActor())));
+        await sys2.Remote().StartAsync();
+
+        var echoActorOn2 = (await sys1.Remote().SpawnAsync("localhost:12001", "noop", TimeSpan.FromSeconds(1))).Pid;
+        _ = Task.Factory.StartNew(async () =>
+        {
+            while (true)
+            {
+                for (var i = 0; i < 200; i++)
+                {
+                    _ = sys1.Root.RequestAsync<Touched>(echoActorOn2, new Touch());
+                }
+                await Task.Yield();
+            }
+        });
+
+        var echoActorOn1 = (await sys2.Remote().SpawnAsync("localhost:12000", "noop", TimeSpan.FromSeconds(1))).Pid;
+        _ = Task.Factory.StartNew(async () =>
+        {
+            while (true)
+            {
+                for (var i = 0; i < 200; i++)
+                {
+                    _ = sys2.Root.RequestAsync<Touched>(echoActorOn1, new Touch());
+                }
+                await Task.Yield();
+            }
+        });
+
+        await Task.Delay(3000);
+
+        sys1.EventStream.Publish(new EndpointTerminatedEvent(false, "localhost:12001", null));
+
+        var port = 12002;
+        for (var i = 12002; i < 12032; i++)
+        {
+            //logger.LogInformation("Touching {i}", i);
+            _ = sys1.Root.RequestAsync<Touched>(new PID($"localhost:{i}", "$1"), new Touch());
+        }
+
+        while (true)
+        {
+            //logger.LogInformation("End");
+            await Task.Delay(1000);
+        }
+    }
+}
+
+public class NoopActor : IActor
+{
+    public async Task ReceiveAsync(IContext context)
+    {
+    }
+}

src/Proto.Remote/Endpoints/EndpointManager.cs

Lines changed: 56 additions & 43 deletions
@@ -53,10 +53,10 @@ public EndpointManager(ActorSystem system, RemoteConfigBase remoteConfig, IChann
 
     public void Start() => SpawnActivator();
 
-    public void Stop()
+    public async Task StopAsync()
     {
         lock (_synLock)
-        {
+        {
             if (CancellationToken.IsCancellationRequested)
             {
                 return;
@@ -67,69 +67,72 @@ public void Stop()
             _system.EventStream.Unsubscribe(_endpointTerminatedEvnSub);
 
             _cancellationTokenSource.Cancel();
+        }
+
+        // release the lock while we dispose, other threads will see the cancellation token and return blocked endpoint.
+        foreach (var endpoint in _serverEndpoints.Values)
+        {
+            await endpoint.DisposeAsync().ConfigureAwait(false);
+        }
 
-            foreach (var endpoint in _serverEndpoints.Values)
-            {
-                endpoint.DisposeAsync().GetAwaiter().GetResult();
-            }
-
-            foreach (var endpoint in _clientEndpoints.Values)
-            {
-                endpoint.DisposeAsync().GetAwaiter().GetResult();
-            }
+        foreach (var endpoint in _clientEndpoints.Values)
+        {
+            await endpoint.DisposeAsync().ConfigureAwait(false);
+        }
 
-            _serverEndpoints.Clear();
-            _clientEndpoints.Clear();
+        _serverEndpoints.Clear();
+        _clientEndpoints.Clear();
 
-            StopActivator();
+        StopActivator();
 
-            Logger.LogDebug("[{SystemAddress}] Stopped", _system.Address);
-        }
+        Logger.LogDebug("[{SystemAddress}] Stopped", _system.Address);
     }
 
-    private void OnEndpointTerminated(EndpointTerminatedEvent evt)
+    private async Task OnEndpointTerminated(EndpointTerminatedEvent evt)
     {
         if (Logger.IsEnabled(LogLevel.Debug))
         {
             Logger.LogDebug("[{SystemAddress}] Endpoint {Address} terminating", _system.Address,
                 evt.Address ?? evt.ActorSystemId);
         }
 
+        Action? unblock = null;
+        IEndpoint? endpoint = null;
         lock (_synLock)
        {
-            if (evt.Address is not null && _serverEndpoints.TryRemove(evt.Address, out var endpoint))
+            if (_cancellationTokenSource.IsCancellationRequested)
            {
-                endpoint.DisposeAsync().GetAwaiter().GetResult();
-
-                if (evt.ShouldBlock && _remoteConfig.WaitAfterEndpointTerminationTimeSpan.HasValue &&
-                    _blockedAddresses.TryAdd(evt.Address, DateTime.UtcNow))
-                {
-                    _ = SafeTask.Run(async () =>
-                    {
-                        await Task.Delay(_remoteConfig.WaitAfterEndpointTerminationTimeSpan.Value)
-                            .ConfigureAwait(false);
-
-                        _blockedAddresses.TryRemove(evt.Address, out _);
-                    });
-                }
+                return;
+            }
+
+            if (evt.Address is not null && _serverEndpoints.TryRemove(evt.Address, out endpoint))
+            {
+                _blockedAddresses.TryAdd(evt.Address, DateTime.UtcNow);
+                unblock = () => _blockedAddresses.TryRemove(evt.Address, out _);
            }
 
            if (evt.ActorSystemId is not null && _clientEndpoints.TryRemove(evt.ActorSystemId, out endpoint))
            {
-                endpoint.DisposeAsync().GetAwaiter().GetResult();
-
-                if (evt.ShouldBlock && _remoteConfig.WaitAfterEndpointTerminationTimeSpan.HasValue &&
-                    _blockedClientSystemIds.TryAdd(evt.ActorSystemId, DateTime.UtcNow))
+                _blockedClientSystemIds.TryAdd(evt.ActorSystemId, DateTime.UtcNow);
+                unblock = () => _blockedClientSystemIds.TryRemove(evt.ActorSystemId, out _);
+            }
+        }
+
+        if (endpoint != null)
+        {
+            // leave the lock to dispose the endpoint, so that requests can't build up behind the lock
+            // the address will always be blocked while we dispose, at a minimum
+            await endpoint.DisposeAsync().ConfigureAwait(false);
+            if (evt.ShouldBlock && _remoteConfig.WaitAfterEndpointTerminationTimeSpan.HasValue)
+            {
+                await Task.Delay(_remoteConfig.WaitAfterEndpointTerminationTimeSpan.Value, CancellationToken).ConfigureAwait(false);
+                if (_cancellationTokenSource.IsCancellationRequested)
                {
-                    _ = SafeTask.Run(async () =>
-                    {
-                        await Task.Delay(_remoteConfig.WaitAfterEndpointTerminationTimeSpan.Value)
-                            .ConfigureAwait(false);
-
-                        _blockedClientSystemIds.TryRemove(evt.ActorSystemId, out _);
-                    });
+                    return;
                }
            }
+
+            unblock?.Invoke();
        }
 
        Logger.LogDebug("[{SystemAddress}] Endpoint {Address} terminated", _system.Address,
@@ -157,11 +160,16 @@ internal IEndpoint GetOrAddServerEndpoint(string? address)
 
        lock (_synLock)
        {
+            if (_cancellationTokenSource.IsCancellationRequested || _blockedAddresses.ContainsKey(address))
+            {
+                return _blockedEndpoint;
+            }
+
            if (_serverEndpoints.TryGetValue(address, out endpoint))
            {
                return endpoint;
            }
-
+
            if (_system.Address.StartsWith(ActorSystem.Client, StringComparison.Ordinal))
            {
                if (Logger.IsEnabled(LogLevel.Debug))
@@ -212,6 +220,11 @@ internal IEndpoint GetOrAddClientEndpoint(string systemId)
 
        lock (_synLock)
        {
+            if (_cancellationTokenSource.IsCancellationRequested || _blockedClientSystemIds.ContainsKey(systemId))
+            {
+                return _blockedEndpoint;
+            }
+
            if (_clientEndpoints.TryGetValue(systemId, out endpoint))
            {
                return endpoint;

src/Proto.Remote/Endpoints/EndpointReader.cs

Lines changed: 10 additions & 3 deletions
@@ -42,6 +42,8 @@ ServerCallContext context
             throw new RpcException(Status.DefaultCancelled, "Suspended");
         }
 
+        var cancellationTokenSource = new CancellationTokenSource();
+
         async void Disconnect()
         {
             try
@@ -60,6 +62,13 @@ async void Disconnect()
                 Logger.LogWarning("[EndpointReader][{SystemAddress}] Failed to write disconnect message to the stream",
                     _system.Address);
             }
+            finally
+            {
+                // When we disconnect, cancel the token, so the reader and writer both stop, and this method returns,
+                // so that the stream actually closes. Without this, when kestrel begins shutdown, it's possible the
+                // connection will stay open until the kestrel shutdown timeout is reached.
+                cancellationTokenSource.Cancel();
+            }
         }
 
         await using (_endpointManager.CancellationToken.Register(Disconnect).ConfigureAwait(false))
@@ -81,8 +90,6 @@ async void Disconnect()
 
             var connectRequest = requestStream.Current.ConnectRequest;
 
-            var cancellationTokenSource = new CancellationTokenSource();
-
             switch (connectRequest.ConnectionTypeCase)
             {
                 case ConnectRequest.ConnectionTypeOneofCase.ClientConnection:
@@ -205,7 +212,7 @@ private async Task RunReader(IAsyncStreamReader<RemoteMessage> requestStream, st
         {
             try
             {
-                while (await requestStream.MoveNext(CancellationToken.None).ConfigureAwait(false))
+                while (await requestStream.MoveNext(cancellationTokenSource.Token).ConfigureAwait(false))
                 {
                     var currentMessage = requestStream.Current;

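
For context on the EndpointReader change above: the per-call CancellationTokenSource is now created before the Disconnect callback is registered, and the callback cancels it in a finally block, so the read loop ends and the handler returns instead of lingering until Kestrel's shutdown timeout. A rough, self-contained sketch of that shape, using a ChannelReader as a stand-in for the gRPC request stream (the names and the stand-in type are illustrative, not the real Proto.Remote API):

using System;
using System.Threading;
using System.Threading.Channels;
using System.Threading.Tasks;

public static class ReceiveLoopSketch
{
    public static async Task HandleCallAsync(
        ChannelReader<string> requestStream,
        CancellationToken endpointManagerToken)
    {
        var cancellationTokenSource = new CancellationTokenSource();

        async void Disconnect()
        {
            try
            {
                // The real EndpointReader writes a DisconnectRequest to the response stream here.
                await Task.Yield();
            }
            finally
            {
                // Ensure the read loop below stops and this handler returns,
                // so the connection closes promptly during host shutdown.
                cancellationTokenSource.Cancel();
            }
        }

        await using (endpointManagerToken.Register(Disconnect))
        {
            try
            {
                // Observe the per-call token instead of CancellationToken.None.
                while (await requestStream.WaitToReadAsync(cancellationTokenSource.Token).ConfigureAwait(false))
                {
                    while (requestStream.TryRead(out var message))
                    {
                        Console.WriteLine($"received: {message}");
                    }
                }
            }
            catch (OperationCanceledException)
            {
                // Disconnect was requested; fall through so the call completes.
            }
        }
    }
}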