Commit

(T): cleanup to incorporate TAMM RMM changes
abagusetty authored and ajaypanyala committed Dec 18, 2023
1 parent eb5181c commit eb20798
Showing 11 changed files with 536 additions and 561 deletions.
2 changes: 0 additions & 2 deletions exachem/cc/cc2/cd_cc2_cs.cpp
@@ -14,7 +14,6 @@ namespace cc2_cs {
using CCEType = double;
CCSE_Tensors<CCEType> _a021;
TiledIndexSpace o_alpha, v_alpha, o_beta, v_beta;
- bool has_gpu_tmp;

Tensor<CCEType> _a01V, _a02V, _a007V;
CCSE_Tensors<CCEType> _a01, _a02, _a03, _a04, _a05, _a06, _a001, _a004, _a006, _a008, _a009, _a017,
@@ -303,7 +302,6 @@ std::tuple<double, double> cc2_cs::cd_cc2_cs_driver(
const TiledIndexSpace& O = MO("occ");
const TiledIndexSpace& V = MO("virt");
auto [cind] = CI.labels<1>("all");
- has_gpu_tmp = ec.has_gpu();

const int otiles = O.num_tiles();
const int vtiles = V.num_tiles();
1 change: 0 additions & 1 deletion exachem/cc/cc2/cd_cc2_os.cpp
@@ -13,7 +13,6 @@ namespace cc2_os {
using CCEType = double;
CCSE_Tensors<CCEType> _a021_os;
TiledIndexSpace o_alpha_os, v_alpha_os, o_beta_os, v_beta_os;
- bool has_gpu_tmp_os;

Tensor<CCEType> _a01V_os, _a02V_os, _a007V_os;
CCSE_Tensors<CCEType> _a01_os, _a02_os, _a03_os, _a04_os, _a05_os, _a06_os, _a001_os, _a004_os,
263 changes: 128 additions & 135 deletions exachem/cc/ccsd/cd_ccsd_cs_ann.cpp

Large diffs are not rendered by default.

450 changes: 224 additions & 226 deletions exachem/cc/ccsd/cd_ccsd_os_ann.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions exachem/cc/ccsd_t/ccsd_t.cpp
@@ -450,6 +450,11 @@ void ccsd_t_driver(std::string filename, OptionsMap options_map) {

bool is_restricted = is_rhf;

+ // Since the singleton pool created by TAMM is not used by the (T) kernel calculation,
+ // we artificially destroy the pool.
+ tamm::reset_rmm_pool();
+ // tamm::reinitialize_rmm_pool();
+
if(rank == 0) {
if(is_restricted) cout << endl << "Running Closed Shell CCSD(T) calculation" << endl;
else cout << endl << "Running Open Shell CCSD(T) calculation" << endl;
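For context, a minimal sketch of how a driver can bracket the (T) stage with the pool reset added above. Only tamm::reset_rmm_pool() and tamm::reinitialize_rmm_pool() come from this commit; the header path, the helper name, and the surrounding control flow are assumptions for illustration, not ExaChem code.

    #include "tamm/tamm.hpp"  // assumed umbrella header for the TAMM calls below

    // Hypothetical helper: release TAMM's RMM pool before the fused (T) kernels,
    // which manage their own device buffers, then optionally rebuild it afterwards.
    void run_triples_stage() {
      tamm::reset_rmm_pool();  // free the singleton pool the (T) kernels do not use

      // ... launch the fused (T) kernels here ...

      // tamm::reinitialize_rmm_pool();  // re-create the pool if TAMM tensor work follows
    }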
74 changes: 19 additions & 55 deletions exachem/cc/ccsd_t/ccsd_t_all_fused.hpp
@@ -111,8 +111,6 @@ void ccsd_t_fully_fused_none_df_none_task(
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
// get (round-robin) GPU stream from pool
gpuStream_t& stream = tamm::GPUStreamPool::getInstance().getStream();
- // get GPU memory handle from pool
- auto& memPool = tamm::GPUPooledStorageManager::getInstance();
#endif

// Index p4b,p5b,p6b,h1b,h2b,h3b;
@@ -140,12 +138,12 @@
T* host_evl_sorted_p6b = &k_evl_sorted[k_offset[t_p6b]];

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
- T* dev_evl_sorted_h1b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_h1b));
- T* dev_evl_sorted_h2b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_h2b));
- T* dev_evl_sorted_h3b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_h3b));
- T* dev_evl_sorted_p4b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_p4b));
- T* dev_evl_sorted_p5b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_p5b));
- T* dev_evl_sorted_p6b = static_cast<T*>(memPool.allocate(sizeof(T) * base_size_p6b));
+ T* dev_evl_sorted_h1b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h1b));
+ T* dev_evl_sorted_h2b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h2b));
+ T* dev_evl_sorted_h3b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h3b));
+ T* dev_evl_sorted_p4b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p4b));
+ T* dev_evl_sorted_p5b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p5b));
+ T* dev_evl_sorted_p6b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p6b));

if(!gpuEventQuery(*done_copy)) { gpuEventSynchronize(*done_copy); }
#endif
@@ -218,38 +216,6 @@ void ccsd_t_fully_fused_none_df_none_task(
size_t num_blocks = CEIL(base_size_h3b, 4) * CEIL(base_size_h2b, 4) * CEIL(base_size_h1b, 4) *
CEIL(base_size_p6b, 4) * CEIL(base_size_p5b, 4) * CEIL(base_size_p4b, 4);

- #ifdef OPT_KERNEL_TIMING
- //
- long double task_num_ops_s1 = 0;
- long double task_num_ops_d1 = 0;
- long double task_num_ops_d2 = 0;
- long double task_num_ops_total = 0;
-
- //
- helper_calculate_num_ops(noab, nvab, df_simple_s1_size, df_simple_d1_size, df_simple_d2_size,
- df_simple_s1_exec, df_simple_d1_exec, df_simple_d2_exec, task_num_ops_s1,
- task_num_ops_d1, task_num_ops_d2, total_num_ops_s1, total_num_ops_d1,
- total_num_ops_d2);
-
- //
- task_num_ops_total = task_num_ops_s1 + task_num_ops_d1 + task_num_ops_d2;
- #endif
-
- #ifdef OPT_KERNEL_TIMING
- gpuEvent_t start_kernel_only, stop_kernel_only;
-
- #if defined(USE_CUDA)
- CUDA_SAFE(cudaEventCreate(&start_kernel_only));
- CUDA_SAFE(cudaEventCreate(&stop_kernel_only));
- CUDA_SAFE(cudaEventRecord(start_kernel_only));
- #elif defined(USE_HIP)
- HIP_SAFE(hipEventCreate(&start_kernel_only));
- HIP_SAFE(hipEventCreate(&stop_kernel_only));
- HIP_SAFE(hipEventRecord(start_kernel_only));
- #endif
-
- #endif // OPT_KERNEL_TIMING

#if defined(USE_DPCPP) || defined(USE_HIP) || (defined(USE_CUDA) && !defined(USE_NV_TC))
fully_fused_ccsd_t_gpu(stream, num_blocks, k_range[t_h1b], k_range[t_h2b], k_range[t_h3b],
k_range[t_p4b], k_range[t_p5b], k_range[t_p6b],
@@ -303,25 +269,23 @@ void ccsd_t_fully_fused_none_df_none_task(
reduceData->factor = factor;

#ifdef USE_CUDA
- CUDA_SAFE(cudaLaunchHostFunc(stream, hostEnergyReduce, reduceData));
- CUDA_SAFE(cudaEventRecord(*done_compute, stream));
+ CUDA_SAFE(cudaLaunchHostFunc(stream.first, hostEnergyReduce, reduceData));
+ CUDA_SAFE(cudaEventRecord(*done_compute, stream.first));
#elif defined(USE_HIP)
- HIP_SAFE(hipLaunchHostFunc(stream, hostEnergyReduce, reduceData));
- HIP_SAFE(hipEventRecord(*done_compute, stream));
+ HIP_SAFE(hipLaunchHostFunc(stream.first, hostEnergyReduce, reduceData));
+ HIP_SAFE(hipEventRecord(*done_compute, stream.first));
#elif defined(USE_DPCPP)
// TODO: the sync might not be needed (stream.first.ext_oneapi_submit_barrier)
- auto host_task_event = stream.submit(
+ auto host_task_event = stream.first.submit(
[&](sycl::handler& cgh) { cgh.host_task([=]() { hostEnergyReduce(reduceData); }); });
- (*done_compute) = stream.ext_oneapi_submit_barrier({host_task_event});
+ (*done_compute) = stream.first.ext_oneapi_submit_barrier({host_task_event});
#endif

- // free device mem back to pool
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_h1b), sizeof(T) * base_size_h1b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_h2b), sizeof(T) * base_size_h2b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_h3b), sizeof(T) * base_size_h3b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_p4b), sizeof(T) * base_size_p4b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_p5b), sizeof(T) * base_size_p5b);
- memPool.deallocate(static_cast<void*>(dev_evl_sorted_p6b), sizeof(T) * base_size_p6b);
-
- #endif // if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
+ freeGpuMem(dev_evl_sorted_h1b);
+ freeGpuMem(dev_evl_sorted_h2b);
+ freeGpuMem(dev_evl_sorted_h3b);
+ freeGpuMem(dev_evl_sorted_p4b);
+ freeGpuMem(dev_evl_sorted_p5b);
+ freeGpuMem(dev_evl_sorted_p6b);
+ #endif
}
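Across the hunks in this file the pattern is uniform: GPUPooledStorageManager allocate/deallocate calls are replaced by the getGpuMem/freeGpuMem helpers, and the raw stream handle is now reached through stream.first. A minimal sketch of the new allocation round trip, with the declarations assumed here for illustration (the real ones come from TAMM's RMM-backed GPU memory headers):

    #include <cstddef>

    // Assumed declarations, mirroring how the calls are used in the diff above.
    void* getGpuMem(std::size_t bytes);  // pool-backed device allocation
    void  freeGpuMem(void* ptr);         // return the block to the pool

    template <typename T>
    void scratch_roundtrip(std::size_t nelems) {
      // was: memPool.allocate(sizeof(T) * nelems) on the pooled storage manager
      T* dev_buf = static_cast<T*>(getGpuMem(sizeof(T) * nelems));

      // ... copy data in and launch kernels on the stream's native handle
      //     (reached via stream.first after the gpuStream_t change) ...

      // was: memPool.deallocate(ptr, bytes); the byte count is no longer passed
      freeGpuMem(dev_buf);
    }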
16 changes: 8 additions & 8 deletions exachem/cc/ccsd_t/ccsd_t_all_fused_gpu.cu
@@ -2591,18 +2591,18 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s
// constant memories
//
cudaMemcpyToSymbolAsync(const_d1_h7b, host_size_d1_h7b, sizeof(int) * size_noab, 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);
cudaMemcpyToSymbolAsync(const_d2_p7b, host_size_d2_p7b, sizeof(int) * size_nvab, 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);

cudaMemcpyToSymbolAsync(const_s1_exec, host_exec_s1, sizeof(int) * (9), 0, cudaMemcpyHostToDevice,
- stream_id);
+ stream_id.first);
cudaMemcpyToSymbolAsync(const_d1_exec, host_exec_d1, sizeof(int) * (9 * size_noab), 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);
cudaMemcpyToSymbolAsync(const_d2_exec, host_exec_d2, sizeof(int) * (9 * size_nvab), 0,
- cudaMemcpyHostToDevice, stream_id);
+ cudaMemcpyHostToDevice, stream_id.first);

- CUDA_SAFE(cudaEventRecord(*done_copy, stream_id));
+ CUDA_SAFE(cudaEventRecord(*done_copy, stream_id.first));

// printf ("[new] s1: %d,%d,%d/%d,%d,%d/%d,%d,%d\n", host_exec_s1[0], host_exec_s1[1],
// host_exec_s1[2], host_exec_s1[3], host_exec_s1[4], host_exec_s1[5], host_exec_s1[6],
@@ -2637,7 +2637,7 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s

// T host_energies_zero[2] = {0.0, 0.0};
// cudaMemcpyAsync(dev_energies, host_energies_zero, sizeof(T) * 2, cudaMemcpyHostToDevice,
- // stream_id);
+ // stream_id.first);

//
// cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
@@ -2647,7 +2647,7 @@ void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, s
// CUCHK(cudaFuncSetAttribute(fused_kernel_d2, cudaFuncAttributeMaxDynamicSharedMemorySize,
// maxbytes));
fully_fused_kernel_ccsd_t_nvidia_tc_fp64<T>
- <<<gridsize_1, blocksize_1, 2 * NUM_STAGE * 8 * STAGE_OFFSET, stream_id>>>(
+ <<<gridsize_1, blocksize_1, 2 * NUM_STAGE * 8 * STAGE_OFFSET, stream_id.first>>>(
(int) size_noab, (int) size_nvab,
//
(int) size_max_dim_s1_t1, (int) size_max_dim_s1_v2, (int) size_max_dim_d1_t2,
