ROCm · avinashkethineedi · Nov 12, 2025 · Nov 12, 2025 · Nov 12, 2025 · Nov 12, 2025
@@ -215,6 +215,9 @@ if (NOT BUILD_TESTS_ONLY)
       -fgpu-rdc
   )
 
+  # Enable warp sync builtins
+  target_compile_definitions(${PROJECT_NAME} PRIVATE HIP_ENABLE_WARP_SYNC_BUILTINS=1)
+
   #############################################################################
   # INSTALL
   #############################################################################

@@ -68,6 +68,8 @@ namespace envvar {
     const var<std::string> provider("PROVIDER", "");
     const var<bool> alternate_qp_ports("ALTERNATE_QP_PORTS", "", true);
     const var<uint8_t> traffic_class("TRAFFIC_CLASS", "", 0);
+    const var<size_t> num_qps_per_pe_default_ctx("NUM_QPS_PER_PE_DEFAULT_CTX", "", 2);
+    const var<size_t> num_qps_per_pe_usr_ctx("NUM_QPS_PER_PE_USR_CTX", "", 2);
   }  // namespace gda
 
   namespace _detail {

@@ -451,6 +451,10 @@ namespace envvar {
     extern const var<std::string> provider;
     extern const var<bool> alternate_qp_ports;
     extern const var<uint8_t> traffic_class;
+    // Number of QPs to create per PE for the default context
+    extern const var<size_t> num_qps_per_pe_default_ctx;
+    // Number of QPs to create per PE for each user context
+    extern const var<size_t> num_qps_per_pe_usr_ctx;
   }  // namespace gda
 }  // namespace envvar
 }  // namespace rocshmem

@@ -77,6 +77,14 @@ void GDABackend::init() {
 
   select_nic();
 
+  // Determine number of QPs to create per PE
+  num_qps_per_pe = envvar::gda::num_qps_per_pe_default_ctx.get_value() +
+                   envvar::gda::num_qps_per_pe_usr_ctx.get_value() *
+                   envvar::max_num_contexts;
+
+  // Total number of QPs created
+  num_qps = num_qps_per_pe * num_pes;
+
   //TODO setup_host_interface();
   /* Initialize the host interface */
   if (MPI_COMM_NULL != backend_comm)
@@ -743,7 +751,7 @@ void GDABackend::exchange_qp_dest_info() {
     dest_info[i].gid = gid;
   }
 
-  for (size_t i = 0; i < envvar::max_num_contexts + 1; i++) {
+  for (size_t i = 0; i < num_qps_per_pe; i++) {
     if (backend_comm != MPI_COMM_NULL) {
       mpilib_ftable_.Alltoall(MPI_IN_PLACE, sizeof(dest_info_t), MPI_CHAR, dest_info.data() + i * num_pes, sizeof(dest_info_t), MPI_CHAR, backend_comm);
     } else {
@@ -794,7 +802,7 @@ void GDABackend::setup_gpu_qps() {
   size_t qp_objs_count;
   size_t qp_objs_mem_size;
 
-  qp_objs_count    = (envvar::max_num_contexts + 1) * num_pes;
+  qp_objs_count    = num_qps;
   qp_objs_mem_size = sizeof(QueuePair) * qp_objs_count;
 
   CHECK_HIP(hipMalloc(&gpu_qps, qp_objs_mem_size));
@@ -813,7 +821,7 @@ void GDABackend::setup_gpu_qps() {
 void GDABackend::cleanup_gpu_qps() {
   size_t qp_objs_count;
 
-  qp_objs_count = (envvar::max_num_contexts + 1) * num_pes;
+  qp_objs_count = num_qps;
 
   for (size_t i = 0; i < qp_objs_count; i++) {
     host_qps[i].~QueuePair();
@@ -1031,23 +1039,20 @@ void GDABackend::modify_qps_rtr_to_rts() {
 
 void GDABackend::create_queues() {
   int ncqes;
-  size_t resize_length;
 
   if (gda_provider == GDAProvider::IONIC) {
     ncqes = envvar::sq_size << 1;
   } else {
     ncqes = envvar::sq_size;
   }
 
-  resize_length = (envvar::max_num_contexts + 1) * num_pes;
-
-  dest_info.resize(resize_length);
-  cqs.resize(resize_length);
-  qps.resize(resize_length);
+  dest_info.resize(num_qps);
+  cqs.resize(num_qps);
+  qps.resize(num_qps);
 
-  bnxt_scqs.resize(resize_length);
-  bnxt_rcqs.resize(resize_length);
-  bnxt_qps.resize(resize_length);
+  bnxt_scqs.resize(num_qps);
+  bnxt_rcqs.resize(num_qps);
+  bnxt_qps.resize(num_qps);
 
   if (gda_provider == GDAProvider::BNXT) {
     bnxt_create_cqs(ncqes);
@@ -1092,7 +1097,7 @@ void GDABackend::alternate_qp_ports() {
      */
 
     /* Re-Map each context */
-    for (size_t i = 1; i < (envvar::max_num_contexts + 1); i += 2) {
+    for (size_t i = 1; i < num_qps_per_pe; i += 2) {
       for (size_t p = 0; p < num_pes; p += 2) {
         cur_qp_idx = (i * num_pes) + p;
         new_qp_idx = cur_qp_idx + 1;

@@ -107,6 +107,19 @@ class GDABackend : public Backend {
   uint64_t *gpu_db_sq = nullptr;
   /* GDA_IONIC END */
 
+  /**
+   * Number of QPs created per PE =
+   * ROCSHMEM_GDA_NUM_QPS_PER_PE_DEFAULT_CTX +
+   * ROCSHMEM_GDA_NUM_QPS_PER_PE_USR_CTX * ROCSHMEM_MAX_NUM_CONTEXTS
+   */
+  size_t num_qps_per_pe {1};
+
+  /**
+   * Total number of QPs created across all PEs =
+   * num_qps_per_pe * num_pes (size_t, matching the product's type)
+   */
+  size_t num_qps {1};
+
  /**
    * @brief Choose nic device according to locality/user preferences
    */