Commit

(T): cleanup to incorporate TAMM RMM changes
abagusetty authored and ajaypanyala committed Dec 18, 2023
1 parent eb5181c commit eb20798
Showing 11 changed files with 536 additions and 561 deletions.
2 changes: 0 additions & 2 deletions exachem/cc/cc2/cd_cc2_cs.cpp
@@ -14,7 +14,6 @@ namespace cc2_cs {
using CCEType = double;
CCSE_Tensors<CCEType> _a021;
TiledIndexSpace o_alpha, v_alpha, o_beta, v_beta;
- bool has_gpu_tmp;

Tensor<CCEType> _a01V, _a02V, _a007V;
CCSE_Tensors<CCEType> _a01, _a02, _a03, _a04, _a05, _a06, _a001, _a004, _a006, _a008, _a009, _a017,
@@ -303,7 +302,6 @@ std::tuple<double, double> cc2_cs::cd_cc2_cs_driver(
const TiledIndexSpace& O = MO("occ");
const TiledIndexSpace& V = MO("virt");
auto [cind] = CI.labels<1>("all");
- has_gpu_tmp = ec.has_gpu();

const int otiles = O.num_tiles();
const int vtiles = V.num_tiles();
1 change: 0 additions & 1 deletion exachem/cc/cc2/cd_cc2_os.cpp
@@ -13,7 +13,6 @@ namespace cc2_os {
using CCEType = double;
CCSE_Tensors<CCEType> _a021_os;
TiledIndexSpace o_alpha_os, v_alpha_os, o_beta_os, v_beta_os;
- bool has_gpu_tmp_os;

Tensor<CCEType> _a01V_os, _a02V_os, _a007V_os;
CCSE_Tensors<CCEType> _a01_os, _a02_os, _a03_os, _a04_os, _a05_os, _a06_os, _a001_os, _a004_os,
263 changes: 128 additions & 135 deletions exachem/cc/ccsd/cd_ccsd_cs_ann.cpp

Large diffs are not rendered by default.

450 changes: 224 additions & 226 deletions exachem/cc/ccsd/cd_ccsd_os_ann.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions exachem/cc/ccsd_t/ccsd_t.cpp
@@ -450,6 +450,11 @@ void ccsd_t_driver(std::string filename, OptionsMap options_map) {

bool is_restricted = is_rhf;

+ // Since the singleton pool created by TAMM is not used by the (T) kernel calculation,
+ // we artificially destroy the pool.
+ tamm::reset_rmm_pool();
+ // tamm::reinitialize_rmm_pool();
+
if(rank == 0) {
if(is_restricted) cout << endl << "Running Closed Shell CCSD(T) calculation" << endl;
else cout << endl << "Running Open Shell CCSD(T) calculation" << endl;
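For context, a minimal sketch of how a driver can bracket the (T) stage with the pool reset added above. Only tamm::reset_rmm_pool() and tamm::reinitialize_rmm_pool() come from this commit; the header path, the helper name, and the surrounding control flow are assumptions for illustration, not ExaChem code.

    #include "tamm/tamm.hpp"  // assumed umbrella header for the TAMM calls below

    // Hypothetical helper: release TAMM's RMM pool before the fused (T) kernels,
    // which manage their own device buffers, then optionally rebuild it afterwards.
    void run_triples_stage() {
      tamm::reset_rmm_pool();  // free the singleton pool the (T) kernels do not use

      // ... launch the fused (T) kernels here ...

      // tamm::reinitialize_rmm_pool();  // re-create the pool if TAMM tensor work follows
    }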
74 changes: 19 additions & 55 deletions exachem/cc/ccsd_t/ccsd_t_all_fused.hpp
@@ -111,8 +111,6 @@ void ccsd_t_fully_fused_none_df_none_task(
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
// get (round-robin) GPU stream from pool
gpuStream_t& stream = tamm::GPUStreamPool::getInstance().getStream();
- // get GPU memory handle from pool
- auto& memPool = tamm::GPUPooledStorageManager::getInstance();
#endif

// Index p4b,p5b,p6b,h1b,h2b,h3b;
@@ -140,12 +138,12 @@
T* host_evl_sorted_p6b = &k_evl_sorted[k_offset[t_p6b]];

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
- T* dev_evl_sorted_h1b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_h1b));
- T* dev_evl_sorted_h2b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_h2b));
- T* dev_evl_sorted_h3b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_h3b));
- T* dev_evl_sorted_p4b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_p4b));
- T* dev_evl_sorted_p5b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_p5b));
- T* dev_evl_sorted_p6b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_p6b));
+ T* dev_evl_sorted_h1b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h1b));
+ T* dev_evl_sorted_h2b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h2b));
+ T* dev_evl_sorted_h3b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h3b));
+ T* dev_evl_sorted_p4b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p4b));
+ T* dev_evl_sorted_p5b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p5b));
+ T* dev_evl_sorted_p6b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p6b));

if(!gpuEventQuery(*done_copy)) { gpuEventSynchronize(*done_copy); }
#endif
@@ -218,38 +216,6 @@ void ccsd_t_fully_fused_none_df_none_task(
size_t num_blocks = CEIL(base_size_h3b, 4) * CEIL(base_size_h2b, 4) * CEIL(base_size_h1b, 4) *
CEIL(base_size_p6b, 4) * CEIL(base_size_p5b, 4) * CEIL(base_size_p4b, 4);

- #ifdef OPT_KERNEL_TIMING
- //
- long double task_num_ops_s1 = 0;
- long double task_num_ops_d1 = 0;
- long double task_num_ops_d2 = 0;
- long double task_num_ops_total = 0;
-
- //
- helper_calculate_num_ops(noab, nvab, df_simple_s1_size, df_simple_d1_size, df_simple_d2_size,
- df_simple_s1_exec, df_simple_d1_exec, df_simple_d2_exec, task_num_ops_s1,
- task_num_ops_d1, task_num_ops_d2, total_num_ops_s1, total_num_ops_d1,
- total_num_ops_d2);
-
- //
- task_num_ops_total = task_num_ops_s1 + task_num_ops_d1 + task_num_ops_d2;
- #endif
-
- #ifdef OPT_KERNEL_TIMING
- gpuEvent_t start_kernel_only, stop_kernel_only;
-
- #if defined(USE_CUDA)
- CUDA_SAFE(cudaEventCreate(&start_kernel_only));
- CUDA_SAFE(cudaEventCreate(&stop_kernel_only));
- CUDA_SAFE(cudaEventRecord(start_kernel_only));
- #elif defined(USE_HIP)
- HIP_SAFE(hipEventCreate(&start_kernel_only));
- HIP_SAFE(hipEventCreate(&stop_kernel_only));
- HIP_SAFE(hipEventRecord(start_kernel_only));
- #endif
-
- #endif // OPT_KERNEL_TIMING

#if defined(USE_DPCPP) || defined(USE_HIP) || (defined(USE_CUDA) && !defined(USE_NV_TC))
fully_fused_ccsd_t_gpu(stream, num_blocks, k_range[t_h1b], k_range[t_h2b], k_range[t_h3b],
k_range[t_p4b], k_range[t_p5b], k_range[t_p6b],
@@ -303,25 +269,23 @@ void ccsd_t_fully_fused_none_df_none_task(
reduceData->factor = factor;

#ifdef USE_CUDA
- CUDA_SAFE(cudaLaunchHostFunc(stream, hostEnergyReduce, reduceData));
- CUDA_SAFE(cudaEventRecord(*done_compute, stream));
+ CUDA_SAFE(cudaLaunchHostFunc(stream.first, hostEnergyReduce, reduceData));
+ CUDA_SAFE(cudaEventRecord(*done_compute, stream.first));
#elif defined(USE_HIP)
- HIP_SAFE(hipLaunchHostFunc(stream, hostEnergyReduce, reduceData));
- HIP_SAFE(hipEventRecord(*done_compute, stream));
+ HIP_SAFE(hipLaunchHostFunc(stream.first, hostEnergyReduce, reduceData));
+ HIP_SAFE(hipEventRecord(*done_compute, stream.first));
#elif defined(USE_DPCPP)
// TODO: the sync might not be needed (stream.first.ext_oneapi_submit_barrier)
- auto host_task_event = stream.submit(
+ auto host_task_event = stream.first.submit(
[&](sycl::handler& cgh) { cgh.host_task([=]() { hostEnergyReduce(reduceData); }); });
- (*done_compute) = stream.ext_oneapi_submit_barrier({host_task_event});
+ (*done_compute) = stream.first.ext_oneapi_submit_barrier({host_task_event});
#endif

- // free device mem back to pool
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_h1b), sizeof(T) * base_size_h1b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_h2b), sizeof(T) * base_size_h2b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_h3b), sizeof(T) * base_size_h3b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_p4b), sizeof(T) * base_size_p4b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_p5b), sizeof(T) * base_size_p5b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_p6b), sizeof(T) * base_size_p6b);
-
- #endif // if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
+ freeGpuMem(dev_evl_sorted_h1b);
+ freeGpuMem(dev_evl_sorted_h2b);
+ freeGpuMem(dev_evl_sorted_h3b);
+ freeGpuMem(dev_evl_sorted_p4b);
+ freeGpuMem(dev_evl_sorted_p5b);
+ freeGpuMem(dev_evl_sorted_p6b);
+ #endif
}
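Across the hunks in this file the pattern is uniform: GPUPooledStorageManager allocate/deallocate calls are replaced by the getGpuMem/freeGpuMem helpers, and the raw stream handle is now reached through stream.first. A minimal sketch of the new allocation round trip, with the declarations assumed here for illustration (the real ones come from TAMM's RMM-backed GPU memory headers):

    #include <cstddef>

    // Assumed declarations, mirroring how the calls are used in the diff above.
    void* getGpuMem(std::size_t bytes);  // pool-backed device allocation
    void  freeGpuMem(void* ptr);         // return the block to the pool

    template <typename T>
    void scratch_roundtrip(std::size_t nelems) {
      // was: memPool.allocate(sizeof(T) * nelems) on the pooled storage manager
      T* dev_buf = static_cast<T*>(getGpuMem(sizeof(T) * nelems));

      // ... copy data in and launch kernels on the stream's native handle
      //     (reached via stream.first after the gpuStream_t change) ...

      // was: memPool.deallocate(ptr, bytes); the byte count is no longer passed
      freeGpuMem(dev_buf);
    }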
16 changes: 8 additions & 8 deletions exachem/cc/ccsd_t/ccsd_t_all_fused_gpu.cu
@@ -2591,18 +2591,18 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s
// constant memories
//
cudaMemcpyToSymbolAsync(const_d1_h7b, host_size_d1_h7b, sizeof(int) * size_noab, 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);
cudaMemcpyToSymbolAsync(const_d2_p7b, host_size_d2_p7b, sizeof(int) * size_nvab, 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);

cudaMemcpyToSymbolAsync(const_s1_exec, host_exec_s1, sizeof(int) * (9), 0, cudaMemcpyHostToDevice,
- stream_id);
+ stream_id.first);
cudaMemcpyToSymbolAsync(const_d1_exec, host_exec_d1, sizeof(int) * (9 * size_noab), 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);
cudaMemcpyToSymbolAsync(const_d2_exec, host_exec_d2, sizeof(int) * (9 * size_nvab), 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);

- CUDA_SAFE(cudaEventRecord(*done_copy, stream_id));
+ CUDA_SAFE(cudaEventRecord(*done_copy, stream_id.first));

// printf ("[new] s1: %d,%d,%d/%d,%d,%d/%d,%d,%d\n", host_exec_s1[0], host_exec_s1[1],
// host_exec_s1[2], host_exec_s1[3], host_exec_s1[4], host_exec_s1[5], host_exec_s1[6],
@@ -2637,7 +2637,7 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s

// T host_energies_zero[2] = {0.0, 0.0};
// cudaMemcpyAsync(dev_energies, host_energies_zero, sizeof(T) * 2, cudaMemcpyHostToDevice,
- // stream_id);
+ // stream_id.first);

//
// cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
@@ -2647,7 +2647,7 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s
// CUCHK(cudaFuncSetAttribute(fused_kernel_d2, cudaFuncAttributeMaxDynamicSharedMemorySize,
// maxbytes));
fully_fused_kernel_ccsd_t_nvidia_tc_fp64<T>
- <<<gridsize_1, blocksize_1, 2 * NUM_STAGE * 8 * STAGE_OFFSET, stream_id>>>(
+ <<<gridsize_1, blocksize_1, 2 * NUM_STAGE * 8 * STAGE_OFFSET, stream_id.first>>>(
(int) size_noab, (int) size_nvab,
//
(int) size_max_dim_s1_t1, (int) size_max_dim_s1_v2, (int) size_max_dim_d1_t2,
