From 386162ee4dcd39673623e47127382bc6d0d39954 Mon Sep 17 00:00:00 2001 From: Jung-Sang Ahn Date: Mon, 20 Jan 2025 00:13:12 -0800 Subject: [PATCH] Do `system_exit` on recurring pre-vote failure due to busy connection (#563) * If a connection is stuck due to a network black hole or similar issues, the sender may not receive either a response to the previous request or an explicit error, preventing any progress through that connection. * Such situations are unlikely to resolve on their own, and sometimes restarting the process is the only solution. If the connection remains busy beyond a certain threshold, `system_exit` will be invoked with `N22_unrecoverable_isolation`. --- include/libnuraft/raft_server.hxx | 29 +++++++++++++++++++++++---- src/handle_vote.cxx | 33 +++++++++++++++++++++++++------ src/raft_server.cxx | 4 ++-- 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/include/libnuraft/raft_server.hxx b/include/libnuraft/raft_server.hxx index b2e5fed7..3f8505a4 100644 --- a/include/libnuraft/raft_server.hxx +++ b/include/libnuraft/raft_server.hxx @@ -117,6 +117,7 @@ public: , reconnect_limit_(50) , leave_limit_(5) , vote_limit_(5) + , busy_connection_limit_(20) {} limits(const limits& src) { @@ -176,6 +177,19 @@ public: * Active only when `auto_adjust_quorum_for_small_cluster_` is enabled. */ std::atomic vote_limit_; + + /** + * If a connection is stuck due to a network black hole or similar issues, + * the sender may not receive either a response to the previous request or + * an explicit error, preventing any progress through that connection. + * Such situations are unlikely to resolve on their own, and sometimes + * restarting the process is the only solution. If the connection remains + * busy beyond this threshold, `system_exit` will be invoked with + * `N22_unrecoverable_isolation`. + * + * If zero, this feature is disabled. + */ + std::atomic busy_connection_limit_; }; raft_server(context* ctx, const init_options& opt = init_options()); @@ -841,18 +855,20 @@ protected: struct pre_vote_status_t { pre_vote_status_t() : quorum_reject_count_(0) - , failure_count_(0) + , no_response_failure_count_(0) + , busy_connection_failure_count_(0) { reset(0); } void reset(ulong _term) { term_ = _term; done_ = false; - live_ = dead_ = abandoned_ = 0; + live_ = dead_ = abandoned_ = connection_busy_ = 0; } ulong term_; std::atomic done_; std::atomic live_; std::atomic dead_; std::atomic abandoned_; + std::atomic connection_busy_; /** * Number of pre-vote rejections by quorum. @@ -860,9 +876,14 @@ protected: std::atomic quorum_reject_count_; /** - * Number of pre-vote failures due to not-responding peers. + * Number of pre-vote failures due to non-responding peers. + */ + std::atomic no_response_failure_count_; + + /** + * Number of pre-vote failures due to busy connections. */ - std::atomic failure_count_; + std::atomic busy_connection_failure_count_; }; /** diff --git a/src/handle_vote.cxx b/src/handle_vote.cxx index 4043c029..0fa951a9 100644 --- a/src/handle_vote.cxx +++ b/src/handle_vote.cxx @@ -18,6 +18,7 @@ See the License for the specific language governing permissions and limitations under the License. **************************************************************************/ +#include "error_code.hxx" #include "raft_server.hxx" #include "cluster_config.hxx" @@ -86,7 +87,7 @@ void raft_server::request_prevote() { if (pre_vote_.live_ + pre_vote_.dead_ > 0) { if (pre_vote_.live_ + pre_vote_.dead_ < quorum_size + 1) { // Pre-vote failed due to non-responding voters. - pre_vote_.failure_count_++; + pre_vote_.no_response_failure_count_++; p_wn("total %d nodes (including this node) responded for pre-vote " "(term %" PRIu64 ", live %d, dead %d), at least %d nodes should " "respond. failure count %d", @@ -95,15 +96,15 @@ void raft_server::request_prevote() { pre_vote_.live_.load(), pre_vote_.dead_.load(), quorum_size + 1, - pre_vote_.failure_count_.load()); + pre_vote_.no_response_failure_count_.load()); } else { - pre_vote_.failure_count_ = 0; + pre_vote_.no_response_failure_count_ = 0; } } int num_voting_members = get_num_voting_members(); if ( params->auto_adjust_quorum_for_small_cluster_ && num_voting_members == 2 && - pre_vote_.failure_count_ > raft_server::raft_limits_.vote_limit_ ) { + pre_vote_.no_response_failure_count_ > raft_server::raft_limits_.vote_limit_ ) { // 2-node cluster's pre-vote failed due to offline node. p_wn("2-node cluster's pre-vote is failing long time, " "adjust quorum to 1"); @@ -166,8 +167,28 @@ void raft_server::request_prevote() { if (pp->make_busy()) { pp->send_req(pp, req, resp_handler_); } else { - p_wn("failed to send prevote request: peer %d (%s) is busy", - pp->get_id(), pp->get_endpoint().c_str()); + pre_vote_.connection_busy_++; + p_wn("failed to send prevote request: peer %d (%s) is busy, count %d", + pp->get_id(), pp->get_endpoint().c_str(), + pre_vote_.connection_busy_.load()); + } + } + + int32 election_quorum_size = get_quorum_for_election() + 1; + if (pre_vote_.connection_busy_ >= election_quorum_size) { + // Couldn't send pre-vote request to majority of peers, + // no hope to get quorum. + pre_vote_.busy_connection_failure_count_++; + p_wn("failed to send prevote request to majority of peers, " + "no hope to get quorum, count: %d", + pre_vote_.busy_connection_failure_count_.load()); + int32_t busy_conn_limit = raft_server::raft_limits_.busy_connection_limit_; + if (busy_conn_limit && + pre_vote_.busy_connection_failure_count_ > busy_conn_limit) { + // LCOV_EXCL_START + p_ft("too many pre-vote failures due to busy connection!"); + ctx_->state_mgr_->system_exit(N22_unrecoverable_isolation); + // LCOV_EXCL_STOP } } } diff --git a/src/raft_server.cxx b/src/raft_server.cxx index ae22564d..38a947bf 100644 --- a/src/raft_server.cxx +++ b/src/raft_server.cxx @@ -1102,7 +1102,7 @@ void raft_server::become_leader() { next_leader_candidate_ = -1; initialized_ = true; pre_vote_.quorum_reject_count_ = 0; - pre_vote_.failure_count_ = 0; + pre_vote_.no_response_failure_count_ = 0; data_fresh_ = true; request_append_entries(); @@ -1418,7 +1418,7 @@ void raft_server::become_follower() { initialized_ = true; uncommitted_config_.reset(); pre_vote_.quorum_reject_count_ = 0; - pre_vote_.failure_count_ = 0; + pre_vote_.no_response_failure_count_ = 0; ptr params = ctx_->get_params(); if ( params->auto_adjust_quorum_for_small_cluster_ &&