Commit 244bd9b

shutdown existing socket when eaddrinuse hit during socket process restart
Parent: 5698251 · Commit: 244bd9b

1 file changed: +25 -42 lines

src/grpcbox_socket.erl

Lines changed: 25 additions & 42 deletions
@@ -16,70 +16,52 @@
 start_link(Pool, ListenOpts, AcceptorOpts) ->
     gen_server:start_link(?MODULE, [Pool, ListenOpts, AcceptorOpts], []).
 
-
 %% gen_server api
 
 init([Pool, ListenOpts, PoolOpts]) ->
-    %% Trapping exit so can close socket in terminate/2
-    _ = process_flag(trap_exit, true),
     Port = maps:get(port, ListenOpts, 8080),
     IPAddress = maps:get(ip, ListenOpts, {0, 0, 0, 0}),
     AcceptorPoolSize = maps:get(size, PoolOpts, 10),
     SocketOpts = maps:get(socket_options, ListenOpts, [{reuseaddr, true},
                                                        {nodelay, true},
+                                                       {reuseaddr, true},
                                                        {backlog, 32768},
                                                        {keepalive, true}]),
+    %% Trapping exit so can close socket in terminate/2
+    _ = process_flag(trap_exit, true),
     Opts = [{active, false}, {mode, binary}, {packet, raw}, {ip, IPAddress} | SocketOpts],
     case gen_tcp:listen(Port, Opts) of
         {ok, Socket} ->
             %% acceptor could close the socket if there is a problem
             MRef = monitor(port, Socket),
-            {ok, _} = grpcbox_pool:accept_socket(Pool, Socket, AcceptorPoolSize),
+            grpcbox_pool:accept_socket(Pool, Socket, AcceptorPoolSize),
             {ok, {Socket, MRef}};
-        {error, eaddrinuse} ->
+        {error, eaddrinuse} = Error ->
             %% our desired port is already in use
-            %% its likely this grpcbox_socket server is restarting
+            %% its likely this grpcbox_socket server has been killed (for reason unknown) and is restarting
             %% previously it would have bound to the port before passing control to our acceptor pool
+            %% the socket remains open
             %% in the restart scenario, the socket process would attempt to bind again
             %% to the port and then stop, the sup would keep restarting it
             %% and we would end up breaching the restart strategy of the parent sup
             %% eventually taking down the entire tree
             %% result of which is we have no active listener and grpcbox is effectively down
-            %% so now if we hit eaddrinuse, we check if our acceptor pool is already the
-            %% controlling process, if so we reuse the port from its state and
-            %% allow grpcbox_socket to start cleanly
+            %% so now if we hit eaddrinuse, we check if our acceptor pool is using it
+            %% if so we close the port here and stop this process
+            %% NOTE: issuing stop in init wont trigger terminate and so cant rely on
+            %% the socket being closed there
+            %% This allows the sup to restart things cleanly
+            %% We could try to reuse the existing port rather than closing it
+            %% but side effects were encountered there, so deliberately avoiding
 
             %% NOTE: acceptor_pool has a grace period for connections before it terminates
             %% grpcbox_pool sets this to a default of 5 secs
             %% this needs considered when deciding on related supervisor restart strategies
             %% AND keep in mind the acceptor pool will continue accepting new connections
             %% during this grace period
 
-            %% Other possible fixes here include changing the grpcbox_services_sup from its
-            %% rest_for_one to a one_for_all strategy. This ensures the pool and thus the
-            %% current controlling process of the socket is terminated
-            %% and allows things to restart cleanly if grpcbox_socket dies
-            %% the disadvantage there however is we will drop all existing grpc connections
-
-            %% Another possible fix is to play with the restart strategy intensity and periods
-            %% and ensure the top level sup doesnt get breached but...
-            %% a requirement will be to ensure the grpcbox_service_sup forces a restart
-            %% of grpcbox_pool and therefore the acceptor_pool process
-            %% as only by doing that will be free up the socket and allow grpcbox_socket to rebind
-            %% thus we end up terminating any existing grpc connections
-
-            %% Yet another possible fix is to move the cleanup of closing the socket
-            %% out of grpcbox_socket's terminate and into acceptor_pool's terminate
-            %% that however puts two way co-ordination between two distinct libs
-            %% which is far from ideal and in addition will also result in existing grpc connection
-            %% being dropped
-
-            %% my view is, if at all possible, its better to restart the grpcbox_socket process without
-            %% impacting existing connections, the fix below allows for that, albeit a lil messy
-            %% there is most likely a better solution to all of this, TODO: revisit
-
             %% get the current sockets in use by the acceptor pool
-            %% if one is bound to our target port then reuse
+            %% if one is bound to our target port then close it
             %% need to allow for possibility of multiple services, each with its own socket
             %% so we need to identify our interested socket via port number
             PoolSockets = grpcbox_pool:pool_sockets(Pool),
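
Note on the lines elided between this hunk and the next: the fold that turns PoolSockets into MaybeHaveExistingSocket is unchanged by this commit, so only its tail appears below. The idea is to pick out the pool socket bound to our target port. A rough standalone sketch of that matching step, assuming the bound port is read back with inet:port/1 (find_pool_socket is a hypothetical helper, not grpcbox code, and the real matching logic may differ):

    %% Sketch: return {ok, Socket} for the pool socket bound to Port,
    %% or socket_not_found if no pool socket uses that port.
    -spec find_pool_socket(inet:port_number(), [gen_tcp:socket()]) ->
              {ok, gen_tcp:socket()} | socket_not_found.
    find_pool_socket(Port, PoolSockets) ->
        lists:foldl(
          fun(Socket, socket_not_found) ->
                  case inet:port(Socket) of
                      {ok, Port} -> {ok, Socket};   % bound to our target port
                      _ -> socket_not_found
                  end;
             (_, Acc) ->
                  Acc                               % already found one, keep it
          end, socket_not_found, PoolSockets).
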
@@ -89,15 +71,15 @@ init([Pool, ListenOpts, PoolOpts]) ->
                                                    {ok, Socket};
                                                (_, Acc) ->
                                                    Acc
-                                               end, {error, eaddrinuse}, PoolSockets),
+                                               end, socket_not_found, PoolSockets),
             case MaybeHaveExistingSocket of
                 {ok, Socket} ->
-                    MRef = monitor(port, Socket),
-                    {ok, {Socket, MRef}};
-                {error, Reason} ->
-                    {stop, Reason}
-            end;
-        {error, Reason}->
+                    gen_tcp:close(Socket);
+                socket_not_found ->
+                    noop
+            end,
+            Error;
+        {error, Reason} ->
             {stop, Reason}
     end.
 
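The stop-in-init NOTE above reflects standard gen_server behaviour: when init/1 returns {stop, Reason}, start/start_link returns {error, Reason} and terminate/2 is never invoked, even with trap_exit set, so any cleanup has to happen before returning. A minimal standalone illustration of just that behaviour (demo module, not grpcbox code):

    -module(stop_in_init_demo).
    -behaviour(gen_server).
    -export([start/0, init/1, handle_call/3, handle_cast/2, terminate/2]).

    start() ->
        %% returns {error, eaddrinuse}; "terminate ran" is never printed
        gen_server:start(?MODULE, [], []).

    init([]) ->
        _ = process_flag(trap_exit, true),
        %% any cleanup (e.g. closing a socket) must happen here,
        %% before returning {stop, Reason}
        {stop, eaddrinuse}.

    handle_call(_Req, _From, State) -> {reply, ok, State}.
    handle_cast(_Msg, State) -> {noreply, State}.

    terminate(_Reason, _State) ->
        io:format("terminate ran~n"),
        ok.
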
@@ -115,10 +97,11 @@ handle_info(_, State) ->
 code_change(_, State, _) ->
     {ok, State}.
 
-terminate(_, {Socket, MRef}) ->
+terminate(_Reason, {Socket, MRef}) ->
     %% Socket may already be down but need to ensure it is closed to avoid
     %% eaddrinuse error on restart
+    %% this takes care of that, unless of course this process is killed...
     case demonitor(MRef, [flush, info]) of
         true -> gen_tcp:close(Socket);
         false -> ok
-    end.
+    end.
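
The demonitor/2 call retained in terminate/2 relies on the [flush, info] options: with info it returns false when the port monitor has already fired (the pending 'DOWN' message is flushed), and true when the monitor was still live, so the socket is only closed when it is presumably still open. A small standalone illustration of the same idiom (demo function, not grpcbox code):

    close_if_alive() ->
        {ok, LSock} = gen_tcp:listen(0, [{active, false}]),
        MRef = monitor(port, LSock),
        case demonitor(MRef, [flush, info]) of
            true  -> gen_tcp:close(LSock);   % monitor still live: close explicitly
            false -> ok                      % 'DOWN' already delivered: port is gone
        end.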
