Skip to content

Commit

Permalink
[CCSD(T)] fix a bug related to returned events from host_task
Browse files Browse the repository at this point in the history
  • Loading branch information
abagusetty authored and ajaypanyala committed Jun 2, 2024
1 parent 519edbb commit 2da848f
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 129 deletions.
88 changes: 48 additions & 40 deletions exachem/cc/ccsd_t/ccsd_t_all_fused.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,38 +43,38 @@ void fully_fused_ccsd_t_gpu(gpuStream_t& stream, size_t num_blocks, size_t base_
int* host_d1_size, int* host_d1_exec, // used
int* host_d2_size, int* host_d2_exec, int* host_s1_size,
int* host_s1_exec,
//
//
#ifdef USE_DPCPP
int* const_df_s1_size, int* const_df_s1_exec, int* const_df_d1_size,
int* const_df_d1_exec, int* const_df_d2_size, int* const_df_d2_exec,
#endif // USE_DPCPP
//
size_t size_noab, size_t size_max_dim_d1_t2, size_t size_max_dim_d1_v2,
size_t size_nvab, size_t size_max_dim_d2_t2, size_t size_max_dim_d2_v2,
size_t size_max_dim_s1_t1, size_t size_max_dim_s1_v2,
//
T factor,
//
T* dev_evl_sorted_h1b, T* dev_evl_sorted_h2b, T* dev_evl_sorted_h3b,
T* dev_evl_sorted_p4b, T* dev_evl_sorted_p5b, T* dev_evl_sorted_p6b,
T* partial_energies, gpuEvent_t* done_copy);
T* partial_energies, event_ptr_t done_copy);
#if defined(USE_CUDA) && defined(USE_NV_TC)
// driver for fully-fused kernel for 3rd gen. tensor core (FP64)
template<typename T>
void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream, size_t numBlks, size_t size_h3,
size_t size_h2, size_t size_h1, size_t size_p6,
size_t size_p5, size_t size_p4,
//
T* dev_s1_t1_all, T* dev_s1_v2_all, T* dev_d1_t2_all,
T* dev_d1_v2_all, T* dev_d2_t2_all, T* dev_d2_v2_all,
//
int* host_size_d1_h7b, int* host_size_d2_p7b,
int* host_exec_s1, int* host_exec_d1, int* host_exec_d2,
//
size_t size_noab, size_t size_nvab,
size_t size_max_dim_s1_t1, size_t size_max_dim_s1_v2,
size_t size_max_dim_d1_t2, size_t size_max_dim_d1_v2,
size_t size_max_dim_d2_t2, size_t size_max_dim_d2_v2,
//
T factor, T* dev_evl_sorted_h1b, T* dev_evl_sorted_h2b,
T* dev_evl_sorted_h3b, T* dev_evl_sorted_p4b,
T* dev_evl_sorted_p5b, T* dev_evl_sorted_p6b,
T* dev_final_energies, gpuEvent_t* done_copy);
void ccsd_t_fully_fused_nvidia_tc_fp64(
gpuStream_t& stream, size_t numBlks, size_t size_h3, size_t size_h2, size_t size_h1,
size_t size_p6, size_t size_p5, size_t size_p4,
//
T* dev_s1_t1_all, T* dev_s1_v2_all, T* dev_d1_t2_all, T* dev_d1_v2_all, T* dev_d2_t2_all,
T* dev_d2_v2_all,
//
int* host_size_d1_h7b, int* host_size_d2_p7b, int* host_exec_s1, int* host_exec_d1,
int* host_exec_d2,
//
size_t size_noab, size_t size_nvab, size_t size_max_dim_s1_t1, size_t size_max_dim_s1_v2,
size_t size_max_dim_d1_t2, size_t size_max_dim_d1_v2, size_t size_max_dim_d2_t2,
size_t size_max_dim_d2_v2,
//
T* dev_evl_sorted_h1b, T* dev_evl_sorted_h2b, T* dev_evl_sorted_h3b, T* dev_evl_sorted_p4b,
T* dev_evl_sorted_p5b, T* dev_evl_sorted_p6b, T* dev_final_energies, event_ptr_t done_copy);
#endif

template<typename T>
Expand All @@ -90,6 +90,11 @@ void ccsd_t_fully_fused_none_df_none_task(
//
int* df_simple_s1_size, int* df_simple_d1_size, int* df_simple_d2_size, int* df_simple_s1_exec,
int* df_simple_d1_exec, int* df_simple_d2_exec,
//
#ifdef USE_DPCPP
int* const_df_s1_size, int* const_df_s1_exec, int* const_df_d1_size, int* const_df_d1_exec,
int* const_df_d2_size, int* const_df_d2_exec,
#endif // USE_DPCPP
//
T* df_dev_s1_t1_all, T* df_dev_s1_v2_all, T* df_dev_d1_t2_all, T* df_dev_d1_v2_all,
T* df_dev_d2_t2_all, T* df_dev_d2_v2_all, T* dev_energies,
Expand All @@ -107,9 +112,8 @@ void ccsd_t_fully_fused_none_df_none_task(
LRUCache<Index, std::vector<T>>& cache_s1t, LRUCache<Index, std::vector<T>>& cache_s1v,
LRUCache<Index, std::vector<T>>& cache_d1t, LRUCache<Index, std::vector<T>>& cache_d1v,
LRUCache<Index, std::vector<T>>& cache_d2t, LRUCache<Index, std::vector<T>>& cache_d2v,
gpuEvent_t* done_compute, gpuEvent_t* done_copy) {
event_ptr_t done_compute, event_ptr_t done_copy) {
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
// get (round-robin) GPU stream from pool
gpuStream_t& stream = tamm::GPUStreamPool::getInstance().getStream();
#endif

Expand Down Expand Up @@ -138,13 +142,11 @@ void ccsd_t_fully_fused_none_df_none_task(
T* host_evl_sorted_p6b = &k_evl_sorted[k_offset[t_p6b]];

#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
T* dev_evl_sorted_h1b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h1b));
T* dev_evl_sorted_h2b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h2b));
T* dev_evl_sorted_h3b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h3b));
T* dev_evl_sorted_p4b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p4b));
T* dev_evl_sorted_p5b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p5b));
T* dev_evl_sorted_p6b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p6b));

// done_copy status needs to be "complete", since the host pointers
// such as df_simple_* were being used for validating the const_memory
// variables in the GPU kernel.
// Before these host variables are reset for the new kernel launch
// ensure the mem-copies are complete
if(!gpuEventQuery(*done_copy)) { gpuEventSynchronize(*done_copy); }
#endif

Expand Down Expand Up @@ -186,6 +188,13 @@ void ccsd_t_fully_fused_none_df_none_task(
#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP)
if(!gpuEventQuery(*done_compute)) { gpuEventSynchronize(*done_compute); }

T* dev_evl_sorted_h1b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h1b));
T* dev_evl_sorted_h2b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h2b));
T* dev_evl_sorted_h3b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_h3b));
T* dev_evl_sorted_p4b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p4b));
T* dev_evl_sorted_p5b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p5b));
T* dev_evl_sorted_p6b = static_cast<T*>(getGpuMem(sizeof(T) * base_size_p6b));

gpuMemcpyAsync<T>(dev_evl_sorted_h1b, host_evl_sorted_h1b, base_size_h1b, gpuMemcpyHostToDevice,
stream);
gpuMemcpyAsync<T>(dev_evl_sorted_h2b, host_evl_sorted_h2b, base_size_h2b, gpuMemcpyHostToDevice,
Expand Down Expand Up @@ -227,12 +236,15 @@ void ccsd_t_fully_fused_none_df_none_task(
//
df_simple_d1_size, df_simple_d1_exec, df_simple_d2_size, df_simple_d2_exec,
df_simple_s1_size, df_simple_s1_exec,
//
//
#ifdef USE_DPCPP
const_df_s1_size, const_df_s1_exec, const_df_d1_size, const_df_d1_exec,
const_df_d2_size, const_df_d2_exec,
#endif // USE_DPCPP
//
noab, max_dim_d1_t2, max_dim_d1_v2, nvab, max_dim_d2_t2, max_dim_d2_v2,
max_dim_s1_t1, max_dim_s1_v2,
//
factor,
//
dev_evl_sorted_h1b, dev_evl_sorted_h2b, dev_evl_sorted_h3b,
dev_evl_sorted_p4b, dev_evl_sorted_p5b, dev_evl_sorted_p6b,
//
Expand All @@ -252,8 +264,6 @@ void ccsd_t_fully_fused_none_df_none_task(
noab, nvab, max_dim_s1_t1, max_dim_s1_v2, max_dim_d1_t2,
max_dim_d1_v2, max_dim_d2_t2, max_dim_d2_v2,
//
factor,
//
dev_evl_sorted_h1b, dev_evl_sorted_h2b, dev_evl_sorted_h3b,
dev_evl_sorted_p4b, dev_evl_sorted_p5b, dev_evl_sorted_p6b,
//
Expand All @@ -275,10 +285,8 @@ void ccsd_t_fully_fused_none_df_none_task(
HIP_SAFE(hipLaunchHostFunc(stream.first, hostEnergyReduce, reduceData));
HIP_SAFE(hipEventRecord(*done_compute, stream.first));
#elif defined(USE_DPCPP)
// TODO: the sync might not be needed (stream.first.ext_oneapi_submit_barrier)
auto host_task_event = stream.first.submit(
(*done_compute) = stream.first.submit(
[&](sycl::handler& cgh) { cgh.host_task([=]() { hostEnergyReduce(reduceData); }); });
(*done_compute) = stream.first.ext_oneapi_submit_barrier({host_task_event});
#endif

freeGpuMem(dev_evl_sorted_h1b);
Expand Down
39 changes: 18 additions & 21 deletions exachem/cc/ccsd_t/ccsd_t_all_fused_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2568,25 +2568,22 @@ __global__ __launch_bounds__(256, 3) void fully_fused_kernel_ccsd_t_nvidia_tc_fp
* @brief the driver of the fully-fused kernel for CCSD(T)
**/
template<typename T>
void ccsd_t_fully_fused_nvidia_tc_fp64(gpuStream_t& stream_id, size_t numBlks, size_t size_h3,
size_t size_h2, size_t size_h1, size_t size_p6,
size_t size_p5, size_t size_p4,
//
T* dev_s1_t1_all, T* dev_s1_v2_all, T* dev_d1_t2_all,
T* dev_d1_v2_all, T* dev_d2_t2_all, T* dev_d2_v2_all,
//
int* host_size_d1_h7b, int* host_size_d2_p7b,
int* host_exec_s1, int* host_exec_d1, int* host_exec_d2,
//
size_t size_noab, size_t size_nvab,
size_t size_max_dim_s1_t1, size_t size_max_dim_s1_v2,
size_t size_max_dim_d1_t2, size_t size_max_dim_d1_v2,
size_t size_max_dim_d2_t2, size_t size_max_dim_d2_v2,
//
T factor, T* dev_evl_sorted_h1b, T* dev_evl_sorted_h2b,
T* dev_evl_sorted_h3b, T* dev_evl_sorted_p4b,
T* dev_evl_sorted_p5b, T* dev_evl_sorted_p6b,
T* dev_energies, gpuEvent_t* done_copy) {
void ccsd_t_fully_fused_nvidia_tc_fp64(
gpuStream_t& stream_id, size_t numBlks, size_t size_h3, size_t size_h2, size_t size_h1,
size_t size_p6, size_t size_p5, size_t size_p4,
//
T* dev_s1_t1_all, T* dev_s1_v2_all, T* dev_d1_t2_all, T* dev_d1_v2_all, T* dev_d2_t2_all,
T* dev_d2_v2_all,
//
int* host_size_d1_h7b, int* host_size_d2_p7b, int* host_exec_s1, int* host_exec_d1,
int* host_exec_d2,
//
size_t size_noab, size_t size_nvab, size_t size_max_dim_s1_t1, size_t size_max_dim_s1_v2,
size_t size_max_dim_d1_t2, size_t size_max_dim_d1_v2, size_t size_max_dim_d2_t2,
size_t size_max_dim_d2_v2,
//
T* dev_evl_sorted_h1b, T* dev_evl_sorted_h2b, T* dev_evl_sorted_h3b, T* dev_evl_sorted_p4b,
T* dev_evl_sorted_p5b, T* dev_evl_sorted_p6b, T* dev_energies, event_ptr_t done_copy) {
//
// constant memories
//
Expand Down Expand Up @@ -2696,8 +2693,8 @@ template void ccsd_t_fully_fused_nvidia_tc_fp64<double>(
size_t size_max_dim_d1_t2, size_t size_max_dim_d1_v2, size_t size_max_dim_d2_t2,
size_t size_max_dim_d2_v2,
//
double factor, double* dev_evl_sorted_h1b, double* dev_evl_sorted_h2b, double* dev_evl_sorted_h3b,
double* dev_evl_sorted_h1b, double* dev_evl_sorted_h2b, double* dev_evl_sorted_h3b,
double* dev_evl_sorted_p4b, double* dev_evl_sorted_p5b, double* dev_evl_sorted_p6b,
double* dev_energies, gpuEvent_t* done_copy);
double* dev_energies, event_ptr_t done_copy);

#endif // USE_NV_TC
Loading

0 comments on commit 2da848f

Please sign in to comment.