diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 1f18ac5f3a8..4723ae766e0 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1479,6 +1479,9 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) D_ASSERT(rpc_priv != NULL); + if (rpc_priv->crp_reply_sent != 0) + goto out; + RPC_ADDREF(rpc_priv); hg_ret = HG_Respond(rpc_priv->crp_hg_hdl, crt_hg_reply_send_cb, rpc_priv, &rpc_priv->crp_pub.cr_output); @@ -1490,6 +1493,8 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) D_GOTO(out, rc = crt_hgret_2_der(hg_ret)); } + rpc_priv->crp_reply_sent = 1; + /* Release input buffer */ if (rpc_priv->crp_release_input_early && !rpc_priv->crp_forward) { hg_ret = HG_Release_input_buf(rpc_priv->crp_hg_hdl); @@ -1513,6 +1518,9 @@ crt_hg_reply_error_send(struct crt_rpc_priv *rpc_priv, int error_code) D_ASSERT(rpc_priv != NULL); D_ASSERT(error_code != 0); + if (rpc_priv->crp_reply_sent != 0) + return; + hg_out_struct = &rpc_priv->crp_pub.cr_output; rpc_priv->crp_reply_hdr.cch_rc = error_code; hg_ret = HG_Respond(rpc_priv->crp_hg_hdl, NULL, NULL, hg_out_struct); @@ -1521,6 +1529,7 @@ crt_hg_reply_error_send(struct crt_rpc_priv *rpc_priv, int error_code) "HG_Respond failed, hg_ret: " DF_HG_RC "\n", DP_HG_RC(hg_ret)); } else { + rpc_priv->crp_reply_sent = 1; RPC_TRACE(DB_NET, rpc_priv, "Sent CART level error message back to client. error_code: %d\n", error_code); diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h index 496e15ff164..075154866a3 100644 --- a/src/cart/crt_rpc.h +++ b/src/cart/crt_rpc.h @@ -168,6 +168,7 @@ struct crt_rpc_priv { * match with crp_req_hdr.cch_flags. */ uint32_t crp_flags; + /* clang-format off */ uint32_t crp_srv : 1, /* flag of server received request */ crp_output_got : 1, crp_input_got : 1, /* flag of collective RPC request */ @@ -180,6 +181,8 @@ struct crt_rpc_priv { crp_in_binheap : 1, /* set if a call to crt_req_reply pending */ crp_reply_pending : 1, + /* set when RPC reply is sent successfully. */ + crp_reply_sent : 1, /* set to 1 if target ep is set */ crp_have_ep : 1, /* RPC is tracked by the context */ @@ -192,6 +195,7 @@ struct crt_rpc_priv { crp_src_is_primary : 1, /* release input buffer early */ crp_release_input_early : 1; + /* clang-format on */ struct crt_opc_info *crp_opc_info; /* corpc info, only valid when (crp_coll == 1) */ diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c index e340d482cec..d90653fffc4 100644 --- a/src/chk/chk_rpc.c +++ b/src/chk/chk_rpc.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2022-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -558,6 +559,7 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran * Let's trigger it explicitly to release related buffer. */ chk_start_post_reply(req, NULL); + crt_req_decref(req); if (rc < 0) { rc1 = chk_stop_remote(rank_list, gen, pool_nr, pools, NULL, NULL); @@ -565,8 +567,6 @@ chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_ran D_ERROR("Failed to cleanup DAOS check with gen "DF_X64": "DF_RC"\n", gen, DP_RC(rc1)); } - - crt_req_decref(req); } D_CDEBUG(rc < 0, DLOG_ERR, DLOG_INFO,