Skip to content

Commit 237a40b

Browse files
authored
Fix bug with NCCL resource reclamation when using multiple grid descriptors. Add NCCL and NVSHMEM resource reclamation to cudecompGridDescDestroy. (#4)
1 parent f9c5e56 commit 237a40b

File tree

2 files changed

+34
-6
lines changed

2 files changed

+34
-6
lines changed

include/internal/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ struct cudecompHandle {
5555
int32_t local_rank; // MPI rank
5656
int32_t local_nranks; // MPI size
5757

58+
// Entries for NCCL management
59+
int n_grid_descs_using_nccl = 0; // Count of grid descriptors using NCCL
5860
ncclComm_t nccl_comm = nullptr; // NCCL communicator (global)
5961
ncclComm_t nccl_local_comm = nullptr; // NCCL communicator (intranode)
6062

src/cudecomp.cc

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -396,12 +396,17 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
396396
}
397397
}
398398
#endif
399-
if (!transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) &&
400-
!haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
401-
CHECK_NCCL(ncclCommDestroy(handle->nccl_comm));
402-
handle->nccl_comm = nullptr;
403-
CHECK_NCCL(ncclCommDestroy(handle->nccl_local_comm));
404-
handle->nccl_local_comm = nullptr;
399+
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
400+
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
401+
handle->n_grid_descs_using_nccl++;
402+
} else {
403+
// Destroy NCCL communicator to reclaim resources if not used
404+
if (handle->nccl_comm && handle->nccl_local_comm && handle->n_grid_descs_using_nccl == 0) {
405+
CHECK_NCCL(ncclCommDestroy(handle->nccl_comm));
406+
handle->nccl_comm = nullptr;
407+
CHECK_NCCL(ncclCommDestroy(handle->nccl_local_comm));
408+
handle->nccl_local_comm = nullptr;
409+
}
405410
}
406411

407412
*grid_desc_in = grid_desc;
@@ -437,6 +442,19 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
437442
if (e) { CHECK_CUDA(cudaEventDestroy(e)); }
438443
}
439444

445+
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
446+
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
447+
handle->n_grid_descs_using_nccl--;
448+
449+
// Destroy NCCL communicator to reclaim resources if not used
450+
if (handle->nccl_comm && handle->nccl_local_comm && handle->n_grid_descs_using_nccl == 0) {
451+
CHECK_NCCL(ncclCommDestroy(handle->nccl_comm));
452+
handle->nccl_comm = nullptr;
453+
CHECK_NCCL(ncclCommDestroy(handle->nccl_local_comm));
454+
handle->nccl_local_comm = nullptr;
455+
}
456+
}
457+
440458
#ifdef ENABLE_NVSHMEM
441459
if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend) ||
442460
haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
@@ -447,6 +465,14 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
447465
nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
448466
}
449467
handle->n_grid_descs_using_nvshmem--;
468+
469+
// Finalize nvshmem to reclaim symmetric heap memory if not used
470+
if (handle->nvshmem_initialized && handle->n_grid_descs_using_nvshmem == 0) {
471+
nvshmem_finalize();
472+
handle->nvshmem_initialized = false;
473+
handle->nvshmem_allocations.clear();
474+
handle->nvshmem_allocation_size = 0;
475+
}
450476
}
451477
#endif
452478

0 commit comments

Comments
 (0)