Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check if we can connect to the remote node before doing connect_node #44

Merged
merged 7 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 76 additions & 3 deletions src/cets_ping.erl
Original file line number Diff line number Diff line change
@@ -1,11 +1,60 @@
-module(cets_ping).
-export([ping/1, ping_pairs/1]).
-export([ping/1, ping_pairs/1, pre_connect/1]).

-ifdef(TEST).
-export([net_family/1]).
-endif.

-ignore_xref([pre_connect/1]).

-spec ping(node()) -> pong | pang.
ping(Node) when is_atom(Node) ->
case lists:member(Node, nodes()) of
true ->
pong;
false ->
case can_preconnect_from_all_nodes(Node) of
true ->
connect_ping(Node);
false ->
pang
end
end.

%% Preconnect checks if the remote node could be connected.
%% It is important to check this on all nodes before actually connecting
%% to avoid getting kicked by overlapped nodes protection in the global module.
%% There are two major scenarios for this function:
%% - Node is down and would return pang on all nodes.
%% - Node is up and would return pong on all nodes.
%% These two scenarios should happen during the normal operation,
%% so code is optimized for them.
%% For first scenario we want to check the local node first and do not do RPCs
%% if possible.
%% For second scenario we want to do the check in parallel on all nodes.
%%
%% This function avoids a corner case when node returns pang on some of the nodes
%% (third scenario).
%% This could be because:
%% - netsplit
%% - node is not resolvable on some of the nodes yet
-spec can_preconnect_from_all_nodes(node()) -> boolean().
can_preconnect_from_all_nodes(Node) ->
Nodes = nodes(),
%% pre_connect is safe to run in parallel
%% (it does not actually create a distributed connection)
case pre_connect(Node) of
pang ->
%% Node is probably down, skip multicall
false;
pong ->
Results = erpc:multicall(Nodes, ?MODULE, pre_connect, [Node], 5000),
%% We skip nodes which do not have cets_ping module and return an error
not lists:member({ok, pang}, Results)
end.

-spec pre_connect(node()) -> pong | pang.
pre_connect(Node) when is_atom(Node) ->
%% It is important to understand, that initial setup for dist connections
%% is done by the single net_kernel process.
%% It calls net_kernel:setup, which calls inet_tcp_dist, which calls
Expand All @@ -24,8 +73,13 @@ ping(Node) when is_atom(Node) ->
case Epmd:address_please(Name, Host, net_family()) of
{error, _} ->
pang;
_ ->
connect_ping(Node)
{ok, IP} ->
case can_connect(IP) of
true ->
pong;
false ->
pang
end
end
end.

Expand Down Expand Up @@ -77,3 +131,22 @@ ping_pairs_stop_on_pang([]) ->

fail_pairs(Pairs, Reason) ->
[{Node1, Node2, Reason} || {Node1, Node2} <- Pairs].

-spec can_connect(inet:ip_address()) -> boolean().
can_connect(IP) ->
case gen_tcp:connect(IP, get_epmd_port(), [], 5000) of
{ok, Socket} ->
gen_tcp:close(Socket),
true;
_ ->
false
end.

-spec get_epmd_port() -> inet:port_number().
get_epmd_port() ->
case init:get_argument(epmd_port) of
{ok, [[PortStr | _] | _]} when is_list(PortStr) ->
list_to_integer(PortStr);
error ->
4369
end.
37 changes: 36 additions & 1 deletion test/cets_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,9 @@ seq_cases() ->
disco_node_start_timestamp_is_updated_after_node_restarts,
disco_nodeup_triggers_check_and_get_nodes,
ping_pairs_returns_pongs,
ping_pairs_returns_earlier
ping_pairs_returns_earlier,
pre_connect_fails_on_our_node,
pre_connect_fails_on_one_of_the_nodes
].

cets_seq_no_log_cases() ->
Expand Down Expand Up @@ -2483,6 +2485,30 @@ format_data_does_not_return_table_duplicates(Config) ->
cets_ping_non_existing_node(_Config) ->
pang = cets_ping:ping('mongooseim@non_existing_host').

pre_connect_fails_on_our_node(_Config) ->
mock_epmd(),
%% We would fail to connect to the remote EPMD but we would get an IP
pang = cets_ping:ping('mongooseim@resolvabletobadip'),
meck:unload().

pre_connect_fails_on_one_of_the_nodes(Config) ->
#{ct2 := Node2} = proplists:get_value(nodes, Config),
mock_epmd(),
%% We would get pong on Node2, but would fail an RPC to our hode
pang = rpc(Node2, cets_ping, ping, ['cetsnode1@localhost']),
History = meck:history(erl_epmd),
%% Check that Node2 called us
?assertMatch(
[_],
[
X
|| {_, {erl_epmd, address_please, ["cetsnode1", "localhost", inet]},
{ok, {192, 168, 100, 134}}} = X <- History
],
History
),
meck:unload().

cets_ping_net_family(_Config) ->
inet = cets_ping:net_family(error),
inet = cets_ping:net_family({ok, [["inet"]]}),
Expand Down Expand Up @@ -2613,6 +2639,8 @@ rpc(Node, M, F, Args) when is_atom(Node) ->
Other
end.

%% Set epmd_port for better coverage
extra_args(ct2) -> ["-epmd_port", "4369"];
extra_args(ct5) -> ["-kernel", "prevent_overlapping_partitions", "false"];
extra_args(_) -> "".

Expand Down Expand Up @@ -2928,3 +2956,10 @@ make_signalling_process() ->
stop -> ok
end
end).

mock_epmd() ->
meck:new(erl_epmd, [passthrough, unstick]),
meck:expect(erl_epmd, address_please, fun
("cetsnode1", "localhost", inet) -> {ok, {192, 168, 100, 134}};
(Name, Host, Family) -> meck:passthrough([Name, Host, Family])
end).