From d9fe6255befff22475643c2040c45ac15d1882c6 Mon Sep 17 00:00:00 2001 From: Samuel Antao Date: Wed, 31 Oct 2018 14:47:48 +0000 Subject: [PATCH] Initial GPU port based on CUDA. --- .gitignore | 10 + config/cs_auto_flags.sh | 8 +- configure.ac | 61 + src/Makefile.am | 2 +- src/alge/Makefile.am | 1 + src/alge/cs_blas.c | 16 + src/alge/cs_matrix.c | 225 ++- src/alge/cs_multigrid.c | 22 +- src/alge/cs_sles_it.c | 90 +- src/alge/cs_sles_pc.c | 16 + src/apps/Makefile.am | 2 + src/apps/cs_solver.c | 11 + src/base/Makefile.am | 2 + src/base/cs_c_bindings.f90 | 21 + src/base/cs_defs.h | 12 + src/base/cs_equation_iterative_solve.c | 8 + src/base/navstv.f90 | 8 +- src/base/predvv.f90 | 9 +- src/base/resopv.f90 | 15 +- src/cuda/Makefile.am | 99 ++ src/cuda/cs_cuda.cu | 1944 ++++++++++++++++++++++++ src/cuda/cs_cuda.h | 132 ++ 22 files changed, 2688 insertions(+), 26 deletions(-) create mode 100644 src/cuda/Makefile.am create mode 100644 src/cuda/cs_cuda.cu create mode 100644 src/cuda/cs_cuda.h diff --git a/.gitignore b/.gitignore index b15550d837..c12907ee3b 100644 --- a/.gitignore +++ b/.gitignore @@ -221,3 +221,13 @@ Makefile.in # /tests/ /patches/*.patch /src/TAGS + +# Eclipse IDE meta files +/.ptp-sync/ +.ptp-sync-folder +.cproject +.project +.settings + +# Temporary files +*~ diff --git a/config/cs_auto_flags.sh b/config/cs_auto_flags.sh index 6b3ce58b0a..333ba640f0 100644 --- a/config/cs_auto_flags.sh +++ b/config/cs_auto_flags.sh @@ -42,6 +42,7 @@ # cxxflags_default_prf # Added to $CXXFLAGS for profiling (default: "-g") # cxxflags_default_omp # Added to $CXXFLAGS for OpenMP (default: "") # cxxflags_default_std # C++ standard variant (default: "") +# cxxflags_default_cuda_offload # Added to $CXXFLAGS for CUDA Offl. (default: "-std=c++11 -I/usr/local/cuda/include") # fcflags_default # Base FCFLAGS (default: "") # fcflags_default_dbg # Added to $FCFLAGS for debugging (default: "-g") @@ -55,11 +56,13 @@ # ldflags_default_opt # Added to $LDFLAGS for optimization (default: "-O") # ldflags_default_prf # Added to $LDFLAGS for profiling (default: "-g") # ldflags_rpath # Added to $LDFLAGS for shared libs (default: "") +# ldflags_default_cuda_offload # Added to $LDFLAGS for CUDA Offl. (default: "-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib") # libs_default # Base LIBS (default: "") # libs_default_dbg # Added to $LIBS for debugging (default: "") # libs_default_opt # Added to $LIBS for optimization (default: "") # libs_default_prf # Added to $LIBS for profiling (default: "") +# libs_default_cuda_offload # Added to $LDFLAGS for CUDA Offl. (default: "-lcuda -lcudart") # Two other environment variable strings are defined, containing possibly # more detailed compiler information: @@ -299,7 +302,6 @@ elif test "x$cs_gcc" = "xclang"; then cflags_default_dbg="-g -O0" cflags_default_opt="-O2" cflags_default_hot="-O3" - cflags_default_omp="-fopenmp=libomp" # Otherwise, are we using pathcc ? 
#--------------------------------- @@ -531,6 +533,7 @@ if test "x$cs_gxx" = "xg++"; then cxxflags_default_hot="-O3" cxxflags_default_omp="-fopenmp" cxxflags_default_std="-ansi -funsigned-char" + cxxflags_default_cuda_offload="-std=c++11 -I/usr/local/cuda/include" # Modify default flags on certain systems @@ -1057,6 +1060,9 @@ ldflags_default_prf="-g" if test "x$cs_linker_set" != "xyes" ; then + ldflags_default_cuda_offload="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib" + libs_default_cuda_offload="-lcuda -lcudart" + case "$host_os" in linux*) diff --git a/configure.ac b/configure.ac index 0d0cdb277a..bb1cbb68c3 100644 --- a/configure.ac +++ b/configure.ac @@ -511,6 +511,62 @@ if test "x$cs_have_openmp_f" = "xyes" ; then AC_FC_LIBRARY_LDFLAGS fi +#------------------------------------------------------------------------------ +# Determine CUDA support +#------------------------------------------------------------------------------ + + +cs_have_cuda_offload=no + +AC_ARG_ENABLE(cuda-offload, + [AS_HELP_STRING([--enable-cuda-offload], [enable CUDA offload])], + [ + case "${enableval}" in + yes) cs_have_cuda_offload=yes ;; + no) cs_have_cuda_offload=no ;; + *) AC_MSG_ERROR([bad value ${enableval} for --enable-cuda-offload]) ;; + esac + ], + [ cs_have_cuda_offload=no ] +) + +AC_MSG_CHECKING([for CUDA support]) +AC_MSG_RESULT($cs_have_cuda_offload) + +if test "x$cs_have_cuda_offload" = "xyes" ; then + + saved_CXXFLAGS="$CXXFLAGS" + saved_LDFLAGS="$LDFLAGS" + + # Select compute capabilities we want to support. + CUDA_COMPUTE_CAPABILITIES="" + CUDA_COMPUTE_CAPABILITIES="${CUDA_COMPUTE_CAPABILITIES} -gencode arch=compute_35,code=sm_35" + CUDA_COMPUTE_CAPABILITIES="${CUDA_COMPUTE_CAPABILITIES} -gencode arch=compute_37,code=sm_37" + CUDA_COMPUTE_CAPABILITIES="${CUDA_COMPUTE_CAPABILITIES} -gencode arch=compute_60,code=sm_60" + CUDA_COMPUTE_CAPABILITIES="${CUDA_COMPUTE_CAPABILITIES} -gencode arch=compute_70,code=sm_70" + + # Add CUDA offload default options + CXXFLAGS="${CXXFLAGS} ${cxxflags_default_cuda_offload}" + CUDAFLAGS="-DHAVE_CONFIG_H -I../../ ${CUDA_COMPUTE_CAPABILITIES} --maxrregcount=64 -Xptxas -v " + + if test "x$debug" = xyes; then + CUDAFLAGS="${CUDAFLAGS} --device-debug" + fi + # Make sure we link against CUDA libraries + LDFLAGS="${LDFLAGS} ${ldflags_default_cuda_offload}" + LIBS="${LIBS} ${libs_default_cuda_offload}" + + # Wrap C++ compilers around nvcc + CUDACC="nvcc -ccbin $CXX" + AC_SUBST(CUDACC) + AC_SUBST(CUDAFLAGS) + AC_DEFINE([HAVE_CUDA_OFFLOAD], 1, [CUDA Offload Support]) +fi + +AC_SUBST(cs_have_cuda_offload) +AM_CONDITIONAL(HAVE_CUDA_OFFLOAD, [test "${cs_have_cuda_offload}" = yes]) + + #------------------------------------------------------------------------------ # Checks for Python support. #------------------------------------------------------------------------------ @@ -694,6 +750,10 @@ elif test x$cs_have_catalyst = xyes ; then cs_have_link_cxx=yes fi fi +# CUDA files are interpreted as C++ files. 
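+# As with Catalyst above, this forces the final link to be driven by a
+# C++-capable linker.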
+if test x$cs_have_cuda_offload = xyes ; then + cs_have_link_cxx=yes +fi if test x$user_CS_LD != "x" ; then CS_LD=$user_CS_LD elif test $cs_have_link_cxx = yes ; then @@ -1161,6 +1221,7 @@ AC_CONFIG_FILES([Makefile po/Makefile.in src/pprt/Makefile src/lagr/Makefile src/rayt/Makefile src/turb/Makefile src/alge/Makefile src/mesh/Makefile src/user/Makefile src/user_examples/Makefile + src/cuda/Makefile gui/Makefile gui/Base/Makefile gui/Pages/Makefile gui/studymanager_gui/Makefile gui/trackcvg/Makefile salome/fsi_coupling/Makefile diff --git a/src/Makefile.am b/src/Makefile.am index 59619f8002..33751c8d9d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -27,7 +27,7 @@ SUBDIRS = . bft mei if HAVE_BACKEND SUBDIRS += \ fvm gui \ -base cdo pprt alge mesh turb darc \ +base cuda cdo pprt alge mesh turb darc \ atmo cfbl cogz comb ctwr elec lagr rayt \ user user_examples apps endif diff --git a/src/alge/Makefile.am b/src/alge/Makefile.am index 844a7c94f6..4e193a9691 100644 --- a/src/alge/Makefile.am +++ b/src/alge/Makefile.am @@ -34,6 +34,7 @@ endif AM_CPPFLAGS = \ -I$(top_srcdir)/src/bft \ +-I$(top_srcdir)/src/cuda \ -I$(top_srcdir)/src/fvm \ -I$(top_srcdir)/src/mei \ -I$(top_srcdir)/src/base \ diff --git a/src/alge/cs_blas.c b/src/alge/cs_blas.c index 6219a4c88d..f012986014 100644 --- a/src/alge/cs_blas.c +++ b/src/alge/cs_blas.c @@ -38,6 +38,7 @@ *----------------------------------------------------------------------------*/ #include "cs_base.h" +#include "cs_cuda.h" #include "cs_parall.h" /*---------------------------------------------------------------------------- @@ -299,6 +300,11 @@ _cs_dot_xx_superblock(cs_lnum_t n, { double dot_xx = 0.0; +# ifdef HAVE_CUDA_OFFLOAD + if (cs_cuda_dot_product_xx(&dot_xx, x, n)) + return dot_xx; +# endif + # pragma omp parallel reduction(+:dot_xx) if (n > CS_THR_MIN) { cs_lnum_t s_id, e_id; @@ -360,6 +366,11 @@ _cs_dot_xx_xy_superblock(cs_lnum_t n, double *xx, double *xy) { +# ifdef HAVE_CUDA_OFFLOAD + if (cs_cuda_dot_product_xx_xy(xx, xy, x, y,n)) + return; +# endif + double dot_xx = 0.0, dot_xy = 0.0; # pragma omp parallel reduction(+:dot_xx, dot_xy) if (n > CS_THR_MIN) @@ -433,6 +444,11 @@ _cs_dot_xy_yz_superblock(cs_lnum_t n, double *xy, double *yz) { +# ifdef HAVE_CUDA_OFFLOAD + if (cs_cuda_dot_product_xy_yz(xy, yz, x, y, z, n)) + return; +# endif + double dot_xy = 0.0, dot_yz = 0.0; # pragma omp parallel reduction(+:dot_xy, dot_yz) if (n > CS_THR_MIN) diff --git a/src/alge/cs_matrix.c b/src/alge/cs_matrix.c index 1e34187fe0..b3ca767950 100644 --- a/src/alge/cs_matrix.c +++ b/src/alge/cs_matrix.c @@ -79,6 +79,7 @@ #include "cs_base.h" #include "cs_blas.h" +#include "cs_cuda.h" #include "cs_halo.h" #include "cs_halo_perio.h" #include "cs_log.h" @@ -181,6 +182,122 @@ static const char *_matrix_operation_name[CS_MATRIX_N_FILL_TYPES][2] * Private function definitions *============================================================================*/ +#ifdef HAVE_CUDA_OFFLOAD + +typedef enum __offload_map_type { + OFFLOAD_MAP, + OFFLOAD_UNMAP, +} offload_map_type; + +static void _offload_map_coeffs(const cs_matrix_t *matrix, offload_map_type map_type) { + + const cs_real_t *mc_x_val = 0; + const cs_real_t *mc_d_val = 0; + size_t ms_n_rows = 0; + size_t x_val_size = 0; + size_t d_val_size = 0; + + switch(matrix->type) { + default: + // Unknown type. + return; + case CS_MATRIX_NATIVE: + // Native matrix - not implemented. 
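+    // Mapping is therefore a no-op for the native format; only the CSR/MSR
+    // layouts handled below are copied to the device.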
+ return; + case CS_MATRIX_MSR: { + const cs_matrix_struct_csr_t *mst = matrix->structure; + const cs_matrix_coeff_msr_t *mc = matrix->coeffs; + if (!mst || !mc) + return; + mc_x_val = mc->x_val; + mc_d_val = mc->d_val; + ms_n_rows = mst->n_rows; + x_val_size = mst->row_index[ms_n_rows]; + d_val_size = ms_n_rows; + } break; + case CS_MATRIX_CSR: + case CS_MATRIX_CSR_SYM: { + const cs_matrix_struct_csr_t *mst = matrix->structure; + const cs_matrix_coeff_csr_t *mc = matrix->coeffs; + if (!mst || !mc) + return; + mc_x_val = mc->val; + ms_n_rows = mst->n_rows; + x_val_size = mst->_row_index[ms_n_rows]; + d_val_size = ms_n_rows; + } break; + } + + switch(map_type) { + case OFFLOAD_MAP: + if (mc_d_val && mc_x_val) { + if (ms_n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_to(mc_x_val, x_val_size*sizeof(cs_real_t)); + cs_cuda_map_to(mc_d_val, d_val_size*sizeof(cs_real_t)); + } + } else if (mc_x_val) { + if (ms_n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_to(mc_x_val, x_val_size*sizeof(cs_real_t)); + } + } break; + case OFFLOAD_UNMAP: + if (mc_d_val && mc_x_val) { + if (ms_n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_release(mc_x_val, 0); + cs_cuda_map_release(mc_d_val, 0); + } + } else if (mc_x_val) { + if (ms_n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_release(mc_x_val, 0); + } + } break; + } +} + +static void _offload_map_structure(const cs_matrix_structure_t *ms, offload_map_type map_type) { + const cs_lnum_t *ms_col_id = 0; + const cs_lnum_t *ms_row_index = 0; + + size_t ms_n_rows = 0; + size_t col_id_size = 0; + size_t row_index_size = 0; + + switch(ms->type) { + default: + // Unknown matrix type. + return; + case CS_MATRIX_NATIVE: + // Native matrix - not implemented. + return; + case CS_MATRIX_MSR: + case CS_MATRIX_CSR: + case CS_MATRIX_CSR_SYM: { + const cs_matrix_struct_csr_t *mst = ms->structure; + ms_col_id = mst->col_id; + ms_row_index = mst->row_index; + ms_n_rows = mst->n_rows; + col_id_size = mst->row_index[ms_n_rows]; + row_index_size = ms_n_rows+1; + } break; + } + + switch(map_type) { + case OFFLOAD_MAP: + if (ms_n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_to(ms_col_id, col_id_size*sizeof(cs_lnum_t)); + cs_cuda_map_to(ms_row_index, row_index_size*sizeof(cs_lnum_t)); + } + break; + case OFFLOAD_UNMAP: + if (ms_n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_release(ms_col_id, 0); + cs_cuda_map_release(ms_row_index, 0); + } + break; + } +} +#endif + /*---------------------------------------------------------------------------- * Set matrix fill metadata. 
* @@ -822,6 +939,11 @@ _set_coeffs_native(cs_matrix_t *matrix, mc->xa = xa; } + +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_MAP); +#endif + } /*---------------------------------------------------------------------------- @@ -834,6 +956,9 @@ _set_coeffs_native(cs_matrix_t *matrix, static void _release_coeffs_native(cs_matrix_t *matrix) { +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_UNMAP); +#endif cs_matrix_coeff_native_t *mc = matrix->coeffs; if (mc != NULL) { mc->da = NULL; @@ -2287,6 +2412,9 @@ _set_coeffs_csr(cs_matrix_t *matrix, } /* (matrix->edges != NULL) */ +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_MAP); +#endif } /*---------------------------------------------------------------------------- @@ -2462,6 +2590,10 @@ _set_coeffs_csr_from_msr(cs_matrix_t *matrix, else _zero_coeffs_csr(matrix); +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_MAP); +#endif + /* Now free transferred arrays */ if (d_vals_transfer != NULL) @@ -2480,6 +2612,10 @@ _set_coeffs_csr_from_msr(cs_matrix_t *matrix, static void _release_coeffs_csr(cs_matrix_t *matrix) { +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_UNMAP); +#endif + cs_matrix_coeff_csr_t *mc = matrix->coeffs; if (mc != NULL) mc->d_val = NULL; @@ -2539,6 +2675,19 @@ _mat_vec_p_l_csr(bool exclude_diag, const cs_matrix_struct_csr_t *ms = matrix->structure; const cs_matrix_coeff_csr_t *mc = matrix->coeffs; cs_lnum_t n_rows = ms->n_rows; + cs_lnum_t n_cols_ext = ms->n_cols_ext; + +# ifdef HAVE_CUDA_OFFLOAD + // Use CUDA for this if available. + if (cs_cuda_mat_vec_p_l_csr( + exclude_diag, + ms->row_index, + ms->col_id, + mc->val, + x, y, + n_rows, n_cols_ext)) + return; +# endif /* Standard case */ @@ -3030,6 +3179,9 @@ _set_coeffs_csr_sym(cs_matrix_t *matrix, } /* (edges != NULL) */ +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_MAP); +#endif } /*---------------------------------------------------------------------------- @@ -3042,6 +3194,10 @@ _set_coeffs_csr_sym(cs_matrix_t *matrix, static void _release_coeffs_csr_sym(cs_matrix_t *matrix) { +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_UNMAP); +#endif + cs_matrix_coeff_csr_sym_t *mc = matrix->coeffs; if (mc != NULL) mc->d_val = NULL; @@ -3589,6 +3745,10 @@ _set_coeffs_msr(cs_matrix_t *matrix, if (xa != NULL) _set_xa_coeffs_msr_increment(matrix, symmetric, n_edges, edges, xa); } + +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_MAP); +#endif } /*---------------------------------------------------------------------------- @@ -3661,6 +3821,10 @@ _set_coeffs_msr_from_msr(cs_matrix_t *matrix, if (x_transferred == false) _map_or_copy_xa_coeffs_msr(matrix, copy, x_vals); +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_MAP); +#endif + /* Now free transferred arrays */ if (d_vals_transfer != NULL) @@ -3679,6 +3843,10 @@ _set_coeffs_msr_from_msr(cs_matrix_t *matrix, static void _release_coeffs_msr(cs_matrix_t *matrix) { +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(matrix, OFFLOAD_UNMAP); +#endif + cs_matrix_coeff_msr_t *mc = matrix->coeffs; if (mc !=NULL) { /* Unmap shared values */ @@ -4037,6 +4205,20 @@ _mat_vec_p_l_msr(bool exclude_diag, const cs_matrix_struct_csr_t *ms = matrix->structure; const cs_matrix_coeff_msr_t *mc = matrix->coeffs; cs_lnum_t n_rows = ms->n_rows; + cs_lnum_t n_cols_ext = ms->n_cols_ext; + + // Use CUDA if available. 
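+  // A nonzero return value means the product was computed on the device;
+  // a zero return falls through to the host implementation below.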
+# ifdef HAVE_CUDA_OFFLOAD + if (cs_cuda_mat_vec_p_l_msr( + exclude_diag, + ms->row_index, + ms->col_id, + mc->x_val, + mc->d_val, + x, y, + n_rows, n_cols_ext)) + return; +# endif /* Standard case */ @@ -4487,18 +4669,26 @@ _pre_vector_multiply_sync_x(cs_halo_rotation_t rotation_mode, const cs_matrix_t *matrix, cs_real_t x[restrict]) { + size_t n_cols_ext = matrix->n_cols_ext; + assert(matrix->halo != NULL); /* Non-blocked version */ if (matrix->db_size[3] == 1) { - if (matrix->halo != NULL) + if (matrix->halo != NULL) { +# ifdef HAVE_CUDA_OFFLOAD + cs_cuda_move_from_device_if_exists(matrix->n_rows,n_cols_ext, x); +# endif cs_halo_sync_component(matrix->halo, CS_HALO_STANDARD, rotation_mode, x); - +# ifdef HAVE_CUDA_OFFLOAD + cs_cuda_move_to_device_if_exists(matrix->n_rows,n_cols_ext-matrix->halo->n_local_elts, x+matrix->halo->n_local_elts); +# endif + } } /* Blocked version */ @@ -4553,10 +4743,13 @@ _pre_vector_multiply_sync_y(const cs_matrix_t *matrix, { size_t n_cols_ext = matrix->n_cols_ext; - if (matrix->db_size[3] == 1) + if (matrix->db_size[3] == 1) { +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_vector_vc_equal_zero_if_exists(matrix->n_rows,n_cols_ext-matrix->n_rows, y+matrix->n_rows)) +# endif _zero_range(y, matrix->n_rows, n_cols_ext); - else + } else _b_zero_range(y, matrix->n_rows, n_cols_ext, matrix->db_size); } @@ -5579,6 +5772,10 @@ cs_matrix_structure_create(cs_matrix_type_t type, ms->numbering = numbering; ms->assembler = NULL; +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_structure(ms, OFFLOAD_MAP); +#endif + return ms; } @@ -5669,6 +5866,10 @@ cs_matrix_structure_create_msr(cs_matrix_type_t type, ms->numbering = numbering; ms->assembler = NULL; +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_structure(ms, OFFLOAD_MAP); +#endif + return ms; } @@ -5732,6 +5933,10 @@ cs_matrix_structure_create_msr_shared(bool have_diag, ms->numbering = numbering; ms->assembler = NULL; +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_structure(ms, OFFLOAD_MAP); +#endif + return ms; } @@ -5776,6 +5981,10 @@ cs_matrix_structure_create_from_assembler(cs_matrix_type_t type, ms->assembler = ma; +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_structure(ms, OFFLOAD_MAP); +#endif + return ms; } @@ -5794,6 +6003,10 @@ cs_matrix_structure_destroy(cs_matrix_structure_t **ms) cs_matrix_structure_t *_ms = *ms; +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_structure(_ms, OFFLOAD_UNMAP); +#endif + _structure_destroy(_ms->type, &(_ms->structure)); /* Now free main structure */ @@ -5983,6 +6196,10 @@ cs_matrix_destroy(cs_matrix_t **matrix) cs_matrix_t *m = *matrix; +#ifdef HAVE_CUDA_OFFLOAD + _offload_map_coeffs(m, OFFLOAD_UNMAP); +#endif + switch(m->type) { case CS_MATRIX_NATIVE: { diff --git a/src/alge/cs_multigrid.c b/src/alge/cs_multigrid.c index 54f9396804..ed526dcf6a 100644 --- a/src/alge/cs_multigrid.c +++ b/src/alge/cs_multigrid.c @@ -51,6 +51,7 @@ #include "cs_base.h" #include "cs_blas.h" +#include "cs_cuda.h" #include "cs_file.h" #include "cs_grid.h" #include "cs_halo.h" @@ -1234,6 +1235,10 @@ _multigrid_setup_sles_it(cs_multigrid_t *mg, wr_size1 += block_size; } +# ifdef HAVE_CUDA_OFFLOAD + mgd->rhs_vx_buf = cs_cuda_host_alloc((wr_size0*n0 + wr_size1*n1)*sizeof(cs_real_t)); + if (!mgd->rhs_vx_buf) +# endif BFT_MALLOC(mgd->rhs_vx_buf, wr_size0*n0 + wr_size1*n1, cs_real_t); size_t block_size_shift = 0; @@ -3613,9 +3618,13 @@ cs_multigrid_solve(void *context, + n_rows * 6 * db_size[1] * sizeof(cs_real_t); unsigned char *_aux_buf = aux_vectors; - if (_aux_size > aux_size) + if (_aux_size > aux_size) { +# ifdef HAVE_CUDA_OFFLOAD + 
_aux_buf = cs_cuda_host_alloc(_aux_size*sizeof(unsigned char)); + if (!_aux_buf) +# endif BFT_MALLOC(_aux_buf, _aux_size, unsigned char); - else + } else _aux_size = aux_size; _level_names_init(name, mg->setup_data->n_levels, _aux_buf); @@ -3673,8 +3682,12 @@ cs_multigrid_solve(void *context, n_cycles++; } - if (_aux_buf != aux_vectors) + if (_aux_buf != aux_vectors) { +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_host_free(_aux_buf)) +# endif BFT_FREE(_aux_buf); + } /* Update statistics */ @@ -3734,6 +3747,9 @@ cs_multigrid_free(void *context) /* Free coarse solution data */ BFT_FREE(mgd->rhs_vx); +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_host_free(mgd->rhs_vx_buf)) +# endif BFT_FREE(mgd->rhs_vx_buf); /* Destroy solver hierarchy */ diff --git a/src/alge/cs_sles_it.c b/src/alge/cs_sles_it.c index e51b0c041e..ce6b8c922f 100644 --- a/src/alge/cs_sles_it.c +++ b/src/alge/cs_sles_it.c @@ -51,6 +51,7 @@ #include "cs_base.h" #include "cs_blas.h" +#include "cs_cuda.h" #include "cs_file.h" #include "cs_log.h" #include "cs_halo.h" @@ -918,6 +919,10 @@ _setup_sles_it(cs_sles_it_t *c, } if (s != NULL) { +#ifdef HAVE_CUDA_OFFLOAD + if (sd->n_rows > CS_CUDA_GPU_THRESHOLD && sd->_ad_inv) + cs_cuda_map_release(sd->_ad_inv, 0); +#endif sd->ad_inv = s->setup_data->ad_inv; BFT_FREE(sd->_ad_inv); } @@ -939,6 +944,10 @@ _setup_sles_it(cs_sles_it_t *c, for (cs_lnum_t i = 0; i < n_rows; i++) sd->_ad_inv[i] = 1.0 / sd->_ad_inv[i]; +#ifdef HAVE_CUDA_OFFLOAD + if (n_rows > CS_CUDA_GPU_THRESHOLD) + cs_cuda_map_to(sd->_ad_inv, n_rows*sizeof(cs_real_t)); +#endif } else { @@ -950,6 +959,11 @@ _setup_sles_it(cs_sles_it_t *c, else _fact_lu(n_blocks, diag_block_size, ad, sd->_ad_inv); +#ifdef HAVE_CUDA_OFFLOAD + if (sd->n_rows > CS_CUDA_GPU_THRESHOLD) + cs_cuda_map_to(sd->_ad_inv, sd->n_rows*diag_block_size*sizeof(cs_real_t)); +#endif + } } @@ -1005,17 +1019,22 @@ _conjugate_gradient(cs_sles_it_t *c, assert(c->setup_data != NULL); const cs_lnum_t n_rows = c->setup_data->n_rows; - + cs_lnum_t tmp_size = 0; { const cs_lnum_t n_cols = cs_matrix_get_n_columns(a) * diag_block_size; const size_t n_wa = 4; const size_t wa_size = CS_SIMD_SIZE(n_cols); - if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa)) + if (aux_vectors == NULL || aux_size/sizeof(cs_real_t) < (wa_size * n_wa)) { +# ifdef HAVE_CUDA_OFFLOAD + _aux_vectors = cs_cuda_host_alloc(wa_size * n_wa * sizeof(cs_real_t)); + if (!_aux_vectors) +# endif BFT_MALLOC(_aux_vectors, wa_size * n_wa, cs_real_t); - else + } else _aux_vectors = aux_vectors; + tmp_size = wa_size * n_wa; rk = _aux_vectors; dk = _aux_vectors + wa_size; gk = _aux_vectors + wa_size*2; @@ -1027,8 +1046,22 @@ _conjugate_gradient(cs_sles_it_t *c, /* Residue and descent direction */ + +#ifdef HAVE_CUDA_OFFLOAD + const cs_lnum_t vx_size = cs_matrix_get_n_columns(a); + if (n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_to(vx,vx_size*sizeof(cs_real_t)); + cs_cuda_map_to(rhs,n_rows*sizeof(cs_real_t)); + cs_cuda_map_alloc(_aux_vectors,tmp_size*sizeof(cs_real_t)); + } +#endif + + // vx will be mapped here if not maaped before. 
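+  // The arrays mapped above stay resident on the device for the whole solve;
+  // the solution vector vx is only copied back to the host by
+  // cs_cuda_map_from_sync once the iteration loop finishes.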
cs_matrix_vector_multiply(rotation_mode, a, vx, rk); /* rk = A.x0 */ +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_vector_vc_sub_equal_va(n_rows, rk, rhs)) +# endif # pragma omp parallel for if(n_rows > CS_THR_MIN) for (cs_lnum_t ii = 0; ii < n_rows; ii++) rk[ii] -= rhs[ii]; @@ -1043,8 +1076,11 @@ _conjugate_gradient(cs_sles_it_t *c, /* Descent direction */ /*-------------------*/ -#if defined(HAVE_OPENMP) +# ifdef HAVE_CUDA_OFFLOAD + if(!cs_cuda_vector_vc_equal_va(n_rows,dk, gk)) +# endif +#if defined(HAVE_OPENMP) # pragma omp parallel for if(n_rows > CS_THR_MIN) for (cs_lnum_t ii = 0; ii < n_rows; ii++) dk[ii] = gk[ii]; @@ -1074,7 +1110,9 @@ _conjugate_gradient(cs_sles_it_t *c, _dot_products_xy_yz(c, rk, dk, zk, &ro_0, &ro_1); alpha = - ro_0 / ro_1; - +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_vector_vc_add_equal_s_mul_vb(n_rows,alpha,vx,dk,rk,zk)) +# endif # pragma omp parallel if(n_rows > CS_THR_MIN) { # pragma omp for nowait @@ -1126,6 +1164,9 @@ _conjugate_gradient(cs_sles_it_t *c, beta = rk_gk / rk_gkm1; rk_gkm1 = rk_gk; +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_vector_vc_equal_va_add_s_mul_vb(n_rows, beta, dk, gk, dk)) +# endif # pragma omp parallel for firstprivate(alpha) if(n_rows > CS_THR_MIN) for (cs_lnum_t ii = 0; ii < n_rows; ii++) dk[ii] = gk[ii] + (beta * dk[ii]); @@ -1136,6 +1177,9 @@ _conjugate_gradient(cs_sles_it_t *c, alpha = - ro_0 / ro_1; +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_vector_vc_add_equal_s_mul_vb(n_rows, alpha, vx, dk, rk, zk)) +# endif # pragma omp parallel if(n_rows > CS_THR_MIN) { # pragma omp for nowait @@ -1146,11 +1190,22 @@ _conjugate_gradient(cs_sles_it_t *c, for (cs_lnum_t ii = 0; ii < n_rows; ii++) rk[ii] += (alpha * zk[ii]); } + } +#ifdef HAVE_CUDA_OFFLOAD + if (n_rows > CS_CUDA_GPU_THRESHOLD) { + cs_cuda_map_release(_aux_vectors,tmp_size*sizeof(cs_real_t)); + cs_cuda_map_release(rhs,n_rows*sizeof(cs_real_t)); + cs_cuda_map_from_sync(vx,vx_size*sizeof(cs_real_t)); } +#endif - if (_aux_vectors != aux_vectors) + if (_aux_vectors != aux_vectors) { +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_host_free(_aux_vectors)) +# endif BFT_FREE(_aux_vectors); + } return cvg; } @@ -4003,6 +4058,7 @@ _p_sym_gauss_seidel_msr(cs_sles_it_t *c, unsigned n_iter = 0; const cs_lnum_t n_rows = cs_matrix_get_n_rows(a); + const cs_lnum_t n_colsext = cs_matrix_get_n_columns(a); const cs_halo_t *halo = cs_matrix_get_halo(a); @@ -4021,6 +4077,11 @@ _p_sym_gauss_seidel_msr(cs_sles_it_t *c, /* Current iteration */ /*-------------------*/ +#ifdef HAVE_CUDA_OFFLOAD + if (n_rows > CS_CUDA_GPU_THRESHOLD) + cs_cuda_map_to(rhs, diag_block_size*n_rows*sizeof(cs_real_t)); +#endif + while (cvg == CS_SLES_ITERATING) { n_iter += 1; @@ -4052,7 +4113,9 @@ _p_sym_gauss_seidel_msr(cs_sles_it_t *c, } else { - +# ifdef HAVE_CUDA_OFFLOAD + if(!cs_cuda_seidel_forward(a_row_index,a_col_id,a_x_val, ad_inv, rhs, vx, diag_block_size, db_size, n_rows, n_colsext)) +# endif # pragma omp parallel for if(n_rows > CS_THR_MIN && !_thread_debug) for (cs_lnum_t ii = 0; ii < n_rows; ii++) { @@ -4117,7 +4180,9 @@ _p_sym_gauss_seidel_msr(cs_sles_it_t *c, } else { - +# ifdef HAVE_CUDA_OFFLOAD + if(!cs_cuda_seidel_backward(a_row_index, a_col_id, a_x_val, ad_inv, ad, rhs, vx, &res2, diag_block_size, db_size, n_rows, n_colsext)) +# endif # pragma omp parallel for reduction(+:res2) \ if(n_rows > CS_THR_MIN && !_thread_debug) for (cs_lnum_t ii = n_rows - 1; ii > - 1; ii--) { @@ -4184,6 +4249,11 @@ _p_sym_gauss_seidel_msr(cs_sles_it_t *c, } +#ifdef HAVE_CUDA_OFFLOAD + if (n_rows > CS_CUDA_GPU_THRESHOLD) + 
cs_cuda_map_release(rhs, 0); +#endif + return cvg; } @@ -5305,6 +5375,10 @@ cs_sles_it_free(void *context) cs_sles_pc_free(c->_pc); if (c->setup_data != NULL) { +#ifdef HAVE_CUDA_OFFLOAD + if (c->setup_data->n_rows > CS_CUDA_GPU_THRESHOLD && c->setup_data->_ad_inv) + cs_cuda_map_release(c->setup_data->_ad_inv, 0); +#endif BFT_FREE(c->setup_data->_ad_inv); BFT_FREE(c->setup_data); } diff --git a/src/alge/cs_sles_pc.c b/src/alge/cs_sles_pc.c index 02db260182..ac205e32ee 100644 --- a/src/alge/cs_sles_pc.c +++ b/src/alge/cs_sles_pc.c @@ -51,6 +51,7 @@ #include "cs_base.h" #include "cs_blas.h" +#include "cs_cuda.h" #include "cs_field.h" #include "cs_log.h" #include "cs_halo.h" @@ -402,6 +403,10 @@ _sles_pc_poly_setup(void *context, # pragma omp parallel for if(n_rows > CS_THR_MIN) for (cs_lnum_t i = 0; i < n_rows; i++) c->_ad_inv[i] = 1.0 / c->_ad_inv[i]; +#ifdef HAVE_CUDA_OFFLOAD + if ( n_rows > CS_CUDA_GPU_THRESHOLD) + cs_cuda_map_to(c->_ad_inv, n_rows*sizeof(cs_real_t)); +#endif } /*---------------------------------------------------------------------------- @@ -473,11 +478,17 @@ _sles_pc_poly_apply_jacobi(void *context, const cs_real_t *restrict ad_inv = c->ad_inv; if (x_in != NULL) { +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_vector_vc_equal_va_mul_vb(n_rows, x_out, x_in, ad_inv)) +# endif # pragma omp parallel for if(n_rows > CS_THR_MIN) for (cs_lnum_t ii = 0; ii < n_rows; ii++) x_out[ii] = x_in[ii] * ad_inv[ii]; } else { +# ifdef HAVE_CUDA_OFFLOAD + if (!cs_cuda_vector_vc_mul_equal_va(n_rows, x_out, ad_inv)) +# endif # pragma omp parallel for if(n_rows > CS_THR_MIN) for (cs_lnum_t ii = 0; ii < n_rows; ii++) x_out[ii] *= ad_inv[ii]; @@ -567,6 +578,11 @@ _sles_pc_poly_free(void *context) { cs_sles_pc_poly_t *c = context; +#ifdef HAVE_CUDA_OFFLOAD + if (c->n_rows > CS_CUDA_GPU_THRESHOLD) + cs_cuda_map_release(c->_ad_inv, 0); +#endif + c->n_rows = 0; c->n_cols = 0; c->n_aux = 0; diff --git a/src/apps/Makefile.am b/src/apps/Makefile.am index 7c9e6f8dc3..c693b533d0 100644 --- a/src/apps/Makefile.am +++ b/src/apps/Makefile.am @@ -96,6 +96,7 @@ if HAVE_BACKEND AM_CPPFLAGS = \ -I$(top_srcdir)/src/alge \ -I$(top_srcdir)/src/base \ +-I$(top_srcdir)/src/cuda \ -I$(top_srcdir)/src/mesh \ -I$(top_srcdir)/src/elec \ -I$(top_srcdir)/src/cdo \ @@ -126,6 +127,7 @@ $(top_builddir)/src/fvm/libfvm_filters.la \ $(top_builddir)/src/base/libcsbase.la \ $(top_builddir)/src/base/libcscore.la \ $(top_builddir)/src/base/libcsrenumber.la \ +$(top_builddir)/src/cuda/libcscuda.la \ $(top_builddir)/src/pprt/libcspprt.la \ $(top_builddir)/src/alge/libcsalge.la \ $(top_builddir)/src/mesh/libcsmesh.la \ diff --git a/src/apps/cs_solver.c b/src/apps/cs_solver.c index 47776e9f8b..97ee2bba48 100644 --- a/src/apps/cs_solver.c +++ b/src/apps/cs_solver.c @@ -55,6 +55,7 @@ #include "cs_control.h" #include "cs_coupling.h" #include "cs_ctwr.h" +#include "cs_cuda.h" #include "cs_domain_setup.h" #include "cs_fan.h" #include "cs_field.h" @@ -592,6 +593,11 @@ main(int argc, } #endif + // Initialize CUDA. +# ifdef HAVE_CUDA_OFFLOAD + cs_cuda_initialize(); +# endif + /* Default initialization */ #if defined(_CS_ARCH_Linux) @@ -653,6 +659,11 @@ main(int argc, else cs_run(); + // Release CUDA resources. 
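+  // (releases the per-rank device memory pool and the pinned reduction
+  //  buffers set up by cs_cuda_initialize)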
+# ifdef HAVE_CUDA_OFFLOAD
+  cs_cuda_finalize();
+# endif
+
   /* Return */

   cs_exit(EXIT_SUCCESS);
diff --git a/src/base/Makefile.am b/src/base/Makefile.am
index 5dcab62507..52ca6c8b7d 100644
--- a/src/base/Makefile.am
+++ b/src/base/Makefile.am
@@ -36,6 +36,7 @@ AM_CPPFLAGS = \
 -I$(top_srcdir)/src/alge \
 -I$(top_srcdir)/src/cdo \
 -I$(top_srcdir)/src/ctwr \
+-I$(top_srcdir)/src/cuda \
 -I$(top_srcdir)/src/lagr \
 -I$(top_srcdir)/src/mesh \
 -I$(top_srcdir)/src/bft \
@@ -61,6 +62,7 @@ $(FCMODINCLUDE)$(top_builddir)/src/cfbl \
 $(FCMODINCLUDE)$(top_builddir)/src/cogz \
 $(FCMODINCLUDE)$(top_builddir)/src/comb \
 $(FCMODINCLUDE)$(top_builddir)/src/ctwr \
+$(FCMODINCLUDE)$(top_builddir)/src/cuda \
 $(FCMODINCLUDE)$(top_builddir)/src/darc \
 $(FCMODINCLUDE)$(top_builddir)/src/elec \
 $(FCMODINCLUDE)$(top_builddir)/src/lagr \
diff --git a/src/base/cs_c_bindings.f90 b/src/base/cs_c_bindings.f90
index 0ac55f2fb2..765ca2ca40 100644
--- a/src/base/cs_c_bindings.f90
+++ b/src/base/cs_c_bindings.f90
@@ -151,6 +151,27 @@ module cs_c_bindings

   interface

+    !---------------------------------------------------------------------------
+
+    ! Interface to C function allocating CUDA pinned host memory.
+    subroutine cs_cuda_attempt_host_alloc(ptr, size_val) &
+      bind(C, name='cs_cuda_attempt_host_alloc')
+      use, intrinsic :: iso_c_binding
+      implicit none
+      type(c_ptr), intent(out) :: ptr
+      integer(c_int), intent(in), value :: size_val
+    end subroutine cs_cuda_attempt_host_alloc
+
+    ! Interface to C function releasing CUDA pinned host memory.
+    subroutine cs_cuda_attempt_host_free(ptr) &
+      bind(C, name='cs_cuda_attempt_host_free')
+      use, intrinsic :: iso_c_binding
+      implicit none
+      type(c_ptr), intent(in), value :: ptr
+    end subroutine cs_cuda_attempt_host_free
+
+    !=============================================================================
+
     subroutine max_limiter_building(f_id, inc, rovsdt) &
       bind(C, name='cs_max_limiter_building')
       use, intrinsic :: iso_c_binding
diff --git a/src/base/cs_defs.h b/src/base/cs_defs.h
index c715b334c8..2b967bc5f2 100644
--- a/src/base/cs_defs.h
+++ b/src/base/cs_defs.h
@@ -412,6 +412,15 @@ typedef struct {
 # define CS_THR_MIN 128
 #endif

+// Minimum number of elements required to actually offload work to the GPU.
+#define GPU_THRESHOLD 1024
+
+// Number of threads to be used in each CUDA block.
+#define GPU_NUMTHD 128
+
+// Number of threads that cooperate to compute the same result within a warp.
+#define GPU_VECSIZE 8
+
 /* Cache line size, or multiple thereof */
 /*--------------------------------------*/

@@ -545,6 +554,9 @@ cs_align(cs_lnum_t i,

 /*----------------------------------------------------------------------------*/

+// Maximum diagonal block size for blocked operations.
+#define DB_SIZE_MAX 8
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
diff --git a/src/base/cs_equation_iterative_solve.c b/src/base/cs_equation_iterative_solve.c
index b62cba5aa9..6762c22879 100644
--- a/src/base/cs_equation_iterative_solve.c
+++ b/src/base/cs_equation_iterative_solve.c
@@ -52,6 +52,7 @@
 #include "bft_printf.h"

 #include "cs_blas.h"
+#include "cs_cuda.h"
 #include "cs_halo.h"
 #include "cs_halo_perio.h"
 #include "cs_log.h"
@@ -1179,6 +1180,10 @@ cs_equation_iterative_solve_vector(int idtvar,
   /* Allocate temporary arrays */

   BFT_MALLOC(dam, n_cells_ext, cs_real_33_t);
+  // dpvar may be needed on the GPU, so we allocate pinned memory for it.
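+  // cs_cuda_host_alloc returns NULL when pinned memory cannot be provided,
+  // in which case the regular BFT_MALLOC below is used instead.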
+# ifdef HAVE_CUDA_OFFLOAD + if (!(dpvar = cs_cuda_host_alloc(n_cells_ext * sizeof(cs_real_3_t)))) +# endif BFT_MALLOC(dpvar, n_cells_ext, cs_real_3_t); BFT_MALLOC(smbini, n_cells_ext, cs_real_3_t); @@ -1801,6 +1806,9 @@ cs_equation_iterative_solve_vector(int idtvar, BFT_FREE(dam); BFT_FREE(xam); BFT_FREE(smbini); +# ifdef HAVE_CUDA_OFFLOAD + if(!cs_cuda_host_free(dpvar)) +# endif BFT_FREE(dpvar); if (iswdyp >= 1) { BFT_FREE(adxk); diff --git a/src/base/navstv.f90 b/src/base/navstv.f90 index 6413309116..47ee52dd25 100644 --- a/src/base/navstv.f90 +++ b/src/base/navstv.f90 @@ -142,7 +142,8 @@ subroutine navstv & double precision, dimension(:,:), pointer :: velk double precision, allocatable, dimension(:), target :: wvisbi double precision, allocatable, dimension(:), target :: cpro_rho_tc, bpro_rho_tc -double precision, allocatable, dimension(:) :: phi +double precision, pointer, dimension(:) :: phi +type(c_ptr) :: phi_ptr double precision, allocatable, dimension(:) :: w1 double precision, allocatable, dimension(:) :: esflum, esflub double precision, allocatable, dimension(:) :: intflx, bouflx @@ -866,7 +867,8 @@ end subroutine resopv endif ! Allocate temporary arrays for the pressure resolution -allocate(phi(ncelet)) +call cs_cuda_attempt_host_alloc(phi_ptr, ncelet) +call c_f_pointer(phi_ptr, phi, [ncelet]) if (ippmod(icompf).lt.0) then @@ -1658,7 +1660,7 @@ end subroutine resopv ! Free memory deallocate(viscf, viscb) -deallocate(phi) +call cs_cuda_attempt_host_free(phi_ptr) deallocate(trav) deallocate(dfrcxt) deallocate(w1) diff --git a/src/base/predvv.f90 b/src/base/predvv.f90 index f4af4c49fc..bf8374a658 100644 --- a/src/base/predvv.f90 +++ b/src/base/predvv.f90 @@ -212,7 +212,8 @@ subroutine predvv & double precision, allocatable, dimension(:,:) :: eswork double precision, allocatable, dimension(:,:), target :: grad double precision, allocatable, dimension(:,:), target :: hl_exp -double precision, dimension(:,:), allocatable :: smbr +double precision, pointer, dimension(:,:) :: smbr +type(c_ptr) :: smbr_ptr double precision, dimension(:,:,:), allocatable :: fimp double precision, dimension(:,:), allocatable :: gavinj double precision, dimension(:,:), allocatable :: tsexp @@ -299,7 +300,9 @@ subroutine predvv & endif ! Allocate temporary arrays -allocate(smbr(3,ncelet)) +call cs_cuda_attempt_host_alloc(smbr_ptr, 3*ncelet) +call c_f_pointer(smbr_ptr, smbr, [3, ncelet]) + allocate(fimp(3,3,ncelet)) allocate(tsexp(3,ncelet)) allocate(tsimp(3,3,ncelet)) @@ -1739,7 +1742,7 @@ subroutine predvv & ! Free memory !------------ -deallocate(smbr) +call cs_cuda_attempt_host_free(smbr_ptr) deallocate(fimp) deallocate(tsexp) deallocate(tsimp) diff --git a/src/base/resopv.f90 b/src/base/resopv.f90 index df72ecef61..984b4ee2ac 100644 --- a/src/base/resopv.f90 +++ b/src/base/resopv.f90 @@ -210,7 +210,11 @@ subroutine resopv & double precision, allocatable, dimension(:) :: coefap, coefbp, coefa_dp2 double precision, allocatable, dimension(:) :: coefa_rho, coefb_rho double precision, allocatable, dimension(:) :: cofafp, cofbfp, coefaf_dp2 -double precision, allocatable, dimension(:) :: rhs, rovsdt + +double precision, pointer, dimension(:) :: rhs +type(c_ptr) :: rhs_ptr + +double precision, allocatable, dimension(:) :: rovsdt double precision, allocatable, dimension(:) :: hydro_pres double precision, allocatable, dimension(:) :: velflx, velflb, ddphi double precision, allocatable, dimension(:,:) :: coefar, cofafr @@ -262,7 +266,11 @@ subroutine resopv & ! 
Allocate temporary arrays allocate(dam(ncelet), xam(nfac)) allocate(res(ncelet), phia(ncelet)) -allocate(rhs(ncelet), rovsdt(ncelet)) + +call cs_cuda_attempt_host_alloc(rhs_ptr, ncelet) +call c_f_pointer(rhs_ptr, rhs, [ncelet]) + +allocate(rovsdt(ncelet)) allocate(iflux(nfac), bflux(ndimfb)) allocate(dphi(ncelet)) allocate(trav(3, ncelet)) @@ -2296,7 +2304,8 @@ subroutine resopv & if (allocated(divu)) deallocate(divu) deallocate(gradp) deallocate(coefaf_dp, coefbf_dp) -deallocate(rhs, rovsdt) +call cs_cuda_attempt_host_free(rhs_ptr) +deallocate(rovsdt) if (allocated(weighf)) deallocate(weighf, weighb) if (iswdyp.ge.1) deallocate(adxk, adxkm1, dphim1, rhs0) if (icalhy.eq.1) deallocate(frchy, dfrchy, hydro_pres) diff --git a/src/cuda/Makefile.am b/src/cuda/Makefile.am new file mode 100644 index 0000000000..6ef5c8d881 --- /dev/null +++ b/src/cuda/Makefile.am @@ -0,0 +1,99 @@ +## Process this file with automake to produce Makefile.in + +#------------------------------------------------------------------------------- + +# This file is part of Code_Saturne, a general-purpose CFD tool. +# +# Copyright (C) IBM Corp. 2017, 2018 +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any later +# version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., 51 Franklin +# Street, Fifth Floor, Boston, MA 02110-1301, USA. + +#------------------------------------------------------------------------------- + +# New suffixes and targets + +SUFFIXES = .cu .o + +# Main part + +AM_CUDAFLAGS = \ +-I$(top_srcdir)/src/alge \ +-I$(top_srcdir)/src/bft \ +-I$(top_srcdir)/src/fvm \ +-I$(top_srcdir)/src/mei \ +-I$(top_srcdir)/src/base \ +-I$(top_srcdir)/src/mesh \ +-I$(top_srcdir)/src/cdo \ +$(CXXFLAGS) $(CUDAFLAGS) + +AM_LDFLAGS = + +# Public header files (to be installed) + +pkginclude_HEADERS = \ +cs_cuda.h \ +$(top_srcdir)/src/alge/cs_matrix.h + +# Library source files + +noinst_LTLIBRARIES = libcscuda.la + +libcscuda_la_CUDAINCOMPATIBLEFLAGS = \ +-W \ +-Wall \ +-Wshadow \ +-Wpointer-arith \ +-Wcast-qual \ +-Wcast-align \ +-Wwrite-strings \ +-Wunused \ +-Wfloat-equal \ +-Wmisleading-indentation \ +-Wduplicated-cond \ +-fdiagnostics-color=auto + +libcscuda_la_CUDAFLAGS = $(filter-out $(libcscuda_la_CUDAINCOMPATIBLEFLAGS),$(AM_CUDAFLAGS)) +libcscuda_la_SOURCES = \ +cs_cuda.cu + +libcscuda_la_CUDAOBJECTS = $(addprefix libcscuda_la-,$(libcscuda_la_SOURCES:.cu=.lo)) + +libcscuda_la_LDFLAGS = -no-undefined + +# We compile the cuda files with a special rule, so we pass them as dependencies to the +# libtool module. +libcscuda_la_LIBADD = $(libcscuda_la_CUDAOBJECTS) +libcscuda_la_DEPENDENCIES = $(libcscuda_la_CUDAOBJECTS) + +# If we con't support CUDA - compile the CUDA file as C++ code. 
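+# configure wraps nvcc around the C++ compiler as $(CUDACC); without CUDA
+# support the same .cu sources are simply compiled by $(CXX) with -x c++.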
+if HAVE_CUDA_OFFLOAD +libcscuda_la-%.lo: %.cu $(pkginclude_HEADERS) + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) \ + --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile \ + $(CUDACC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libcscuda_la_CUDAFLAGS) \ + -c -o $@ \ + `test -f "$<" || echo "$(srcdir)/"`$< +else +libcscuda_la-%.lo: %.cu $(pkginclude_HEADERS) + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) \ + --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile \ + $(CXX) -x c++ $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libcscuda_la_CUDAFLAGS) \ + -c -o $@ \ + `test -f "$<" || echo "$(srcdir)/"`$< +endif +clean-local: + -rm -f *__genmod.f90 *__genmod.mod diff --git a/src/cuda/cs_cuda.cu b/src/cuda/cs_cuda.cu new file mode 100644 index 0000000000..1e52146f22 --- /dev/null +++ b/src/cuda/cs_cuda.cu @@ -0,0 +1,1944 @@ +/*============================================================================ + * CUDA offloading support + *============================================================================*/ + +/* + This file is part of Code_Saturne, a general-purpose CFD tool. + + Copyright (C) IBM Corp. 2017, 2018 + + This program is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any later + version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +/*---------------------------------------------------------------------------- + * Standard C/C++ library headers + *----------------------------------------------------------------------------*/ +#include +#include + +#include +#include + +/*---------------------------------------------------------------------------- + * Local headers + *----------------------------------------------------------------------------*/ +#include "cs_cuda.h" +#include "cs_matrix.h" + +#include "bft_mem.h" + +/*---------------------------------------------------------------------------- + * Have Eclipse CDT parser deal with CUDA specific syntax and data types. + *----------------------------------------------------------------------------*/ +#ifdef __CDT_PARSER__ +#define __global__ +#define __device__ +#define __shared__ + +namespace { +dim3 threadIdx; +dim3 blockIdx; +dim3 blockDim; +dim3 gridDim; +int warpSize; + +template +__device__ T __shfl_down_sync(unsigned mask, T var, unsigned int delta, + int width = warpSize); + +void __syncthreads(void); + +double atomicAdd(double *address, double val); +unsigned long long int atomicCAS(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val); +} // namespace + +#define LAUNCH_KERNEL(...) any_kernels<__VA_ARGS__>() +#else +#define LAUNCH_KERNEL(...) 
\ + case getKindsID<__VA_ARGS__>(): \ + any_kernels<__VA_ARGS__><<>>(); \ + break +#endif + +/*---------------------------------------------------------------------------- + * Implementation if CUDA is defined + *----------------------------------------------------------------------------*/ +#ifdef HAVE_CUDA_OFFLOAD +#include +#include + +#define CHECK(x) \ + if (cudaError_t err = (x)) { \ + printf("CUDA error in '%s', at line '%d': %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + assert(false); \ + } + +namespace { +// The number of devices available in a node. This is NOT the devices visible +// for the current rank. +int NumberNodeDevices = -1; +// The device ID associated with this rank +int DeviceID = -1; + +// Total number of ranks in current node. +int NumberRanks = 0; + +// Device properties. +cudaDeviceProp DeviceProperties; + +// Function that flushes the argument to the GPU and launch the relevant +// kernels. +void flush(void); + +// Class that manages the memory mapped to the GPU. +class MemoryManager { +public: + // The types of the map. + enum MapType { maptype_alloc, maptype_to, maptype_from, maptype_release }; + + const char *getMapTypeName(MapType Type) const { + switch (Type) { + case maptype_alloc: + return "Alloc"; + case maptype_to: + return "To"; + case maptype_from: + return "From"; + case maptype_release: + return "Release"; + } + return nullptr; + } + +private: + // Information for 1 map + class SlotInfo { + // The pointer to the slot. + char *Begin; + // The pointer to the end of the slot (non-inclusive). + char *End; + // The pointer in the device. + char *DevBegin; + // The number of times the slot was referenced. + size_t RefCount; + + public: + char *getBegin(void) const { return Begin; } + char *getEnd(void) const { return End; } + size_t getSize(void) const { return End - Begin; } + char *getDevBegin(void) const { return DevBegin; } + size_t getCount(void) const { return RefCount; } + void incCount(void) { ++RefCount; } + void decCount(void) { + assert(RefCount > 0 && "This slot ref count is already zero!"); + --RefCount; + } + + SlotInfo() : Begin(nullptr), End(nullptr), DevBegin(nullptr), RefCount(1) {} + SlotInfo(char *Begin, char *End, char *DevBegin) + : Begin(Begin), End(End), DevBegin(DevBegin), RefCount(1) {} + SlotInfo(char *Ptr, char *DevPtr, size_t Size) + : Begin(Ptr), End(Ptr + Size), DevBegin(DevPtr), RefCount(1) {} + }; + + // List with all current maps. + std::list SlotsInfo; + + void dumpSlots(void) { +#ifdef CS_CUDA_DEBUG + printf("Mapped slots:\n"); + for (const auto &S : SlotsInfo) + printf("-> (refcount=%d) %p-%p[:\n", (int)S.getCount(), S.getBegin(), + S.getEnd()); +#endif + } + + // Variables to communicate reduction results. + cs_real_t *ReductionHost = nullptr; + cs_real_t *ReductionDev = nullptr; + + // Variable to control dual buffering of reduction results. + cs_lnum_t ReductionResultVersion = 0; + + // Memory chunk for the current rank to use. + void *GPUStorage = nullptr; + + // + // Lists that keep the used and available memory slots. + // + class GPUStorageSlot { + // Utility to align addresses to 256 bytes. + static uintptr_t AlignPtr(uintptr_t Addr) { + return (Addr + 256 - 1) / 256 * 256; + } + uintptr_t Ptr; + size_t Size; + + public: + uintptr_t getPtr() const { return Ptr; } + uintptr_t getAlignedPtr() const { return AlignPtr(Ptr); } + size_t getSize() const { return Size; } + size_t getAlignedSize() const { + size_t ASize = Size - (getAlignedPtr() - getPtr()); + return (ASize > getSize()) ? 
0 : ASize; + } + uintptr_t getBegin() const { return Ptr; } + uintptr_t getEnd() const { return Ptr + Size; } + + GPUStorageSlot(void *Ptr, size_t Size) + : Ptr(reinterpret_cast(Ptr)), Size(Size) {} + GPUStorageSlot(uintptr_t Ptr, size_t Size) : Ptr(Ptr), Size(Size) {} + }; + + std::list GPUStorageUsedSlots; // Sorted by address + std::list GPUStorageFreeSlots; // Sorted by address + + // Malloc and free GPU storage. + void *GPUStorageMalloc(size_t Size) { + flush(); + + // Candidate iterator. + auto FE = GPUStorageFreeSlots.end(); + auto FC = FE; + + // Look for the smallest slot where we can fit the storage. + for (auto FI = GPUStorageFreeSlots.begin(); FI != FE; ++FI) { + // We got a slot! Get the one with the minimum size. + if (FI->getAlignedSize() >= Size) + FC = (FC == FE) ? FI : ((FC->getSize() > FI->getSize()) ? FI : FC); + } + + // No slots, so we have no memory available. + if (FC == FE) { + assert(false && "Run out of GPU memory space."); + return nullptr; + } + + // Create a slot in the used list. Find where to insert + // and emplace it there. + auto UI = GPUStorageUsedSlots.begin(); + for (auto UE = GPUStorageUsedSlots.end(); UI != UE; ++UI) + if (UI->getPtr() > FC->getPtr()) + break; + + uintptr_t AlignPtr = FC->getAlignedPtr(); + GPUStorageUsedSlots.emplace(UI, FC->getPtr(), + Size + AlignPtr - FC->getPtr()); + + // Adjust the slot from the free-slots list. + if (FC->getAlignedSize() == Size) + GPUStorageFreeSlots.erase(FC); + else + *FC = GPUStorageSlot(AlignPtr + Size, FC->getAlignedSize() - Size); + + return reinterpret_cast(AlignPtr); + } + + void GPUStorageFree(void *InPtr) { + flush(); + uintptr_t Ptr = reinterpret_cast(InPtr); + + // Look for the candidate slot the pointer refers too. + auto UC = GPUStorageUsedSlots.begin(); + auto UE = GPUStorageUsedSlots.end(); + for (; UC != UE; ++UC) + if (UC->getAlignedPtr() == Ptr) + break; + + // If we can't find the pointer, we have an invalid pointer to dealloc. + if (UC == UE) { + assert(false && "Can't find GPU storage to deallocate."); + return; + } + + // Look in the free slots list where the free space should be inserted. + auto FI = GPUStorageFreeSlots.begin(); + auto FE = GPUStorageFreeSlots.end(); + for (; FI != FE; ++FI) + if (FI->getPtr() > UC->getPtr()) + break; + + // Check if the slot can be combined with previous free slot. + auto PrevI = (FI != GPUStorageFreeSlots.begin()) ? std::prev(FI) : FE; + if (PrevI != FE && UC->getPtr() == PrevI->getEnd()) { + // Combine with previous slot. + *UC = GPUStorageSlot(PrevI->getPtr(), PrevI->getSize() + UC->getSize()); + GPUStorageFreeSlots.erase(PrevI); + } + + // Check if the slot can be combined with following + if (FI != FE && FI->getPtr() == UC->getEnd()) { + *UC = GPUStorageSlot(UC->getPtr(), UC->getSize() + FI->getSize()); + auto RemoveI = FI; + ++FI; + GPUStorageFreeSlots.erase(RemoveI); + } + + // Move contents of used list into free list. + GPUStorageFreeSlots.splice(GPUStorageFreeSlots.begin(), GPUStorageUsedSlots, + UC); + return; + } + +public: + void initialize(void) { + CHECK(cudaMallocHost((void **)&ReductionHost, 3 * sizeof(cs_real_t))); + CHECK(cudaMalloc((void **)&ReductionDev, 6 * sizeof(cs_real_t))); + CHECK(cudaMemsetAsync(ReductionDev, 0, 6 * sizeof(cs_real_t))) + + // Allocate a chunk of memory to avoid having to repeat malloc/free + // operations in the GPU. We assume that ranks are evenly distributed + // by GPUs. We take 80% of the memory available and split it by ranks. 
+ assert(NumberRanks && NumberNodeDevices); + size_t RanksUsingSameGPU = + (NumberRanks + NumberNodeDevices - 1) / NumberNodeDevices; + size_t MemoryForAllRanks = DeviceProperties.totalGlobalMem * 80ul / 100ul; + size_t MemoryForCurrentRank = MemoryForAllRanks / RanksUsingSameGPU; + + printf(" --> Allocating %lu/%lu bytes in DeviceID: %d\n", + MemoryForCurrentRank, DeviceProperties.totalGlobalMem, DeviceID); + + CHECK(cudaMalloc(&GPUStorage, MemoryForCurrentRank)); + + assert(GPUStorage); + + // We start we a very empty slot. + GPUStorageFreeSlots.emplace_back(GPUStorage, MemoryForCurrentRank); + } + void finalize(void) { + if (ReductionHost) + CHECK(cudaFreeHost(ReductionHost)); + if (ReductionDev) + CHECK(cudaFree(ReductionDev)); + if (GPUStorage) + CHECK(cudaFree(GPUStorage)); + + printf("Have %d used slots and %d free slots\n", GPUStorageUsedSlots.size(), + GPUStorageFreeSlots.size()); + printf("Used Slots:\n"); + for (auto &S : GPUStorageUsedSlots) + printf("---> Used [0x%016lx-0x%016lx[\n", S.getPtr(), S.getEnd()); + printf("Free Slots:\n"); + for (auto &S : GPUStorageFreeSlots) + printf("---> Free [0x%016lx-0x%016lx[\n", S.getPtr(), S.getEnd()); + } + + // Get reduction results. + cs_real_t *getReductionDevPtr(void) const { + assert(ReductionDev && "Reduction results storage is not defined!"); + return ReductionDev; + } + cs_real_t *getReductionResults(void) { + // Copy the data from the GPU from the correct version of the buffer. + cs_real_t *DevPtr = ReductionDev + (ReductionResultVersion * 3); + + flush(); + CHECK(cudaMemcpy(ReductionHost, DevPtr, 3 * sizeof(cs_real_t), + cudaMemcpyDeviceToHost)); + + // Toggle version + ReductionResultVersion ^= 0x01; + + return ReductionHost; + } + cs_lnum_t getReductionVersion(void) const { return ReductionResultVersion; } + + // Map/unmap the specified pointer to the device. Return the device pointer or + // null if the pointer is not valid. All movements are asynchronous unless the + // client specifies the synchronous flag. + void map(const void *InputPointer, size_t Size, MapType Type, + bool Synchronous = false) { + // static int ID = 0; + + // If we don't have devices we don't need to do anything. + if (!NumberNodeDevices) + return; + + char *Pointer = + const_cast(reinterpret_cast(InputPointer)); + + assert(Pointer && "Invalid slot information"); + + char *Begin = Pointer; + char *End = Pointer + Size; + + // + // Detect the provided pointer in the list of maps. + // + auto II = SlotsInfo.begin(); + auto IE = SlotsInfo.end(); + + // Zero size slot. + if (Begin == End) + for (; II != IE && II->getBegin() != Begin; ++II) + ; + // Non-zero size slot. + else + for (; II != IE; ++II) { + // There are no overlaps. + if (Begin >= II->getEnd() || End <= II->getBegin()) + continue; + + // The slot is contained in existing slot. + if (II->getBegin() <= Begin && II->getEnd() >= End) + break; + + // Partial overlap of slot. + printf("Slot [%p-%p[ partially overlaps with [%p-%p[\n", Begin, End, + II->getBegin(), II->getEnd()); + assert(false); + } + + // + // Allocate/deallocate and move data as needed. + // + + // + // Slot exists. + // + if (II != IE) { + if (Type == maptype_alloc || Type == maptype_to) { + II->incCount(); + dumpSlots(); + return; + } + + assert((Type == maptype_from || Type == maptype_release) && + "Unexpected map type."); + + // Is this the last reference? If, so copy the data back and destroy the + // slot. 
+ II->decCount(); + if (!II->getCount()) { + if (Type == maptype_from) { + flush(); + if (Synchronous) { + CHECK(cudaMemcpy(Begin, II->getDevBegin(), II->getSize(), + cudaMemcpyDeviceToHost)); + } else { + CHECK(cudaMemcpyAsync(Begin, II->getDevBegin(), II->getSize(), + cudaMemcpyDeviceToHost)); + } + } + + GPUStorageFree(II->getDevBegin()); + SlotsInfo.erase(II); + } + dumpSlots(); + return; + } + + // + // Slot does not exist. + // + + // Slot does not exist so there is no point in releasing it. + if (Type == maptype_release) { + dumpSlots(); + return; + } + + assert((Type == maptype_alloc || Type == maptype_to) && + "Can't move from nonexistent slot."); + char *DevicePtr = reinterpret_cast(GPUStorageMalloc(Size)); + + assert(DevicePtr && "Malloc returned invalid device pointer."); + + if (Type == maptype_to) { + if (Synchronous) { + flush(); + CHECK(cudaMemcpy(DevicePtr, Begin, Size, cudaMemcpyHostToDevice)); + } else { + CHECK(cudaMemcpyAsync(DevicePtr, Begin, Size, cudaMemcpyHostToDevice)); + } + } + + SlotsInfo.emplace_front(Begin, End, DevicePtr); + + dumpSlots(); + return; + } + + // Return the device pointer for the specified host pointer. + void *getDevicePtr(const void *InputHostPointer, + bool MustExist = true) const { + char *HostPointer = + const_cast(reinterpret_cast(InputHostPointer)); + + assert(HostPointer && "Invalid host pointer!"); + + for (auto &S : SlotsInfo) { + if (HostPointer >= S.getBegin() && HostPointer < S.getEnd()) { + char *DevPtr = S.getDevBegin(); + assert(DevPtr && "Invalid device pointer!"); + // Return the device pointer. We need to use the same offset the host + // pointer is using. + return DevPtr + (HostPointer - S.getBegin()); + } + } + + assert(!MustExist && "No device pointer found for specified host pointer."); + return nullptr; + } + + template + T *getDevicePtr(T *InputHostPointer, bool MustExist = true) const { + const T *HostPointer = InputHostPointer; + return reinterpret_cast( + getDevicePtr(reinterpret_cast(HostPointer), MustExist)); + } +}; + +MemoryManager MM; + +// Number of threads that cooperate in the computation of one vector element. +// Using too little will reduce the number of coalesced accessed while accessing +// the indexes. Using too much will result in wasting resources, and some +// threads won't have anything to do. This number should be around the average +// number of non zero elements in each row of the input matrix. Must divide 32. +#define CUDA_GPU_GROUPSIZE (8) +// Minimum number of elements (rows) a group takes care of. This will influence +// how many blocks will run overall. +#define CUDA_GPU_MIN_ROWS_PER_GROUP (8) +// Number of threads per block. We should have enough threads for the block to +// be busy, but we should not use too much, otherwise it will decrease the +// likelihood of getting more blocks being scheduled for the same SM. Must be a +// multiple of 32. +#define CUDA_GPU_NUMTHD (128) +#define CUDA_GPU_GROUPS (CUDA_GPU_NUMTHD / CUDA_GPU_GROUPSIZE) +#define CUDA_GPU_WARP_SIZE 32 + +unsigned setBlockAndGridDimensions(cs_lnum_t n_rows, dim3 &blockSize, + dim3 &gridSize) { + unsigned NumGroups = CUDA_GPU_GROUPS; + unsigned MinRowsPerGroup = CUDA_GPU_MIN_ROWS_PER_GROUP; + + // Target number of rows per block. + unsigned RowsPerBlock = MinRowsPerGroup * NumGroups; + + // Required number of blocks - ceil(n_rows/RowsPerBlock). + gridSize = {(n_rows + RowsPerBlock - 1) / RowsPerBlock, 1, 1}; + + // If it does not meet the requirements, we have to adjust the number of + // rows per group. 
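+  // i.e. when n_rows / RowsPerBlock exceeds maxGridSize[0], the number of
+  // rows handled per group is scaled up so that the grid fits.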
+ if (gridSize.x > DeviceProperties.maxGridSize[0]) { + uint64_t AdjustedRowsPerGroup = (((uint64_t)MinRowsPerGroup * gridSize.x) + + DeviceProperties.maxGridSize[0] - 1) / + DeviceProperties.maxGridSize[0]; + + RowsPerBlock = AdjustedRowsPerGroup * NumGroups; + gridSize.x = (n_rows + RowsPerBlock - 1) / RowsPerBlock; + } + + assert(gridSize.x <= DeviceProperties.maxGridSize[0] && + "Error in adjusting number of rows per block."); + blockSize = {CUDA_GPU_NUMTHD, 1, 1}; + return RowsPerBlock; +} + +// Matrix formats used in matrix vector multiplications. +enum KernelKinds { + // Matrix-vector: + MV_CSR, + MV_MSR, + MV_CSR_No_Diag, + MV_MSR_No_Diag, + MV_Seidel, + MV_Seidel_With_Red, + // Dot product: + DP_xx, + DP_xx_xy, + DP_xy_yz, + // Vector operations: + VO_vc_equal_zero, + VO_vc_equal_va, + VO_vc_sub_equal_va, + VO_vc_mul_equal_va, + VO_vc_equal_va_mul_vb, + VO_vc_add_equal_s_mul_vb, + VO_2x_vc_add_equal_s_mul_vb, + VO_vc_equal_va_add_s_mul_vb, + + // Invalid kind. + InvalidKernelKind, +}; + +const char *getKernelKindName(KernelKinds Kind) { + switch (Kind) { + // Matrix-vector: + case MV_CSR: + return "MV_CSR"; + case MV_MSR: + return "MV_MSR"; + case MV_CSR_No_Diag: + return "MV_CSR_No_Diag"; + case MV_MSR_No_Diag: + return "MV_MSR_No_Diag"; + case MV_Seidel: + return "MV_Seidel"; + case MV_Seidel_With_Red: + return "MV_Seidel_With_Red"; + // Dot product: + case DP_xx: + return "DP_xx"; + case DP_xx_xy: + return "DP_xx_xy"; + case DP_xy_yz: + return "DP_xy_yz"; + // Vector operations: + case VO_vc_equal_zero: + return "VO_vc_equal_zero"; + case VO_vc_equal_va: + return "VO_vc_equal_va"; + case VO_vc_sub_equal_va: + return "VO_vc_sub_equal_va"; + case VO_vc_mul_equal_va: + return "VO_vc_mul_equal_va"; + case VO_vc_equal_va_mul_vb: + return "VO_vc_equal_va_mul_vb"; + case VO_vc_add_equal_s_mul_vb: + return "VO_vc_add_equal_s_mul_vb"; + case VO_2x_vc_add_equal_s_mul_vb: + return "VO_2x_vc_add_equal_s_mul_vb"; + case VO_vc_equal_va_add_s_mul_vb: + return "VO_vc_equal_va_add_s_mul_vb"; + // Invalid kind. + case InvalidKernelKind: + return "InvalidKernelKind"; + } + return nullptr; +} + +// Kernel to perform matrix-vector multiplication. +template +__device__ void matrix_vector_multiplication( + const cs_lnum_t *restrict dev_row_index, + const cs_lnum_t *restrict dev_col_id, const cs_real_t *restrict dev_val, + const cs_real_t *restrict dev_d_val, const cs_real_t *restrict dev_x, + cs_real_t *restrict dev_y, cs_lnum_t n_rows, cs_lnum_t n_cols, + cs_lnum_t n_rows_per_block) { + + const unsigned bdimx = CUDA_GPU_GROUPSIZE; + const unsigned bdimy = CUDA_GPU_GROUPS; + const unsigned tidx = threadIdx.x % CUDA_GPU_GROUPSIZE; + const unsigned tidy = threadIdx.x / CUDA_GPU_GROUPSIZE; + + // We expect a 2D block, where y is the row a group of threads cooperate on, + // and x is the ID of each thread working on the same resulting vector + // element. + const cs_lnum_t StartRow = n_rows_per_block * blockIdx.x + tidy; + const cs_lnum_t EndRowExp = StartRow + n_rows_per_block; + const cs_lnum_t EndRow = (n_rows < EndRowExp) ? 
n_rows : EndRowExp; + + for (cs_lnum_t ii = StartRow; ii < EndRow; ii += bdimy) { + unsigned AM = __activemask(); + const cs_lnum_t r0 = dev_row_index[ii]; + const cs_lnum_t r1 = dev_row_index[ii + 1]; + + const cs_lnum_t *restrict col_id = dev_col_id + r0; + const cs_real_t *restrict m_row = dev_val + r0; + const cs_lnum_t n_cols = r1 - r0; + cs_real_t sii = 0.0; + + for (cs_lnum_t jj = tidx; jj < n_cols; jj += bdimx) + if (Kind == MV_MSR || Kind == MV_MSR_No_Diag || Kind == MV_CSR || + col_id[jj] != ii) + sii += (m_row[jj] * dev_x[col_id[jj]]); + + for (cs_lnum_t kk = 1; kk < bdimx; kk *= 2) + sii += __shfl_down_sync(AM, sii, kk, bdimx); + + if (!tidx) { + + if (Kind == MV_MSR) + sii += dev_d_val[ii] * dev_x[ii]; + + dev_y[ii] = sii; + } + } + return; +} + +#if (__CUDA_ARCH__ < 600) +// Atomic double add for older GPUs. +__device__ double oldAtomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} +#endif + +__device__ void _fw_and_bw_lu_gs(const cs_real_t mat[], int db_size, + cs_real_t x[], const cs_real_t b[]) { + + /* forward */ + for (int ii = 0; ii < db_size; ii++) { + x[ii] = b[ii]; + for (int jj = 0; jj < ii; jj++) + x[ii] -= x[jj] * mat[ii * db_size + jj]; + } + + /* backward */ + for (int ii = db_size - 1; ii >= 0; ii--) { + for (int jj = db_size - 1; jj > ii; jj--) + x[ii] -= x[jj] * mat[ii * db_size + jj]; + x[ii] /= mat[ii * (db_size + 1)]; + } +} + +#if (DB_SIZE_MAX > CUDA_GPU_GROUPSIZE) +# error MSR block kernels need the block size to be compatible with number of CUDA group threads. +#endif +#undef DB_SIZE_MAX +template +__device__ void matrix_vector_seidel_block( + const cs_lnum_t *restrict dev_row_index, + const cs_lnum_t *restrict dev_col_id, const cs_real_t *restrict dev_val, + const cs_real_t *restrict dev_ad_inv, const cs_real_t *restrict dev_ad, + const cs_real_t *restrict dev_rhs, cs_real_t *restrict dev_vx, + cs_lnum_t dev_red_version, cs_real_t *restrict dev_res2x, + cs_lnum_t diag_block_size, cs_lnum_t dev_db0, cs_lnum_t dev_db1, + cs_lnum_t dev_db2, cs_lnum_t dev_db3, cs_lnum_t n_rows, cs_lnum_t n_cols_in, + cs_lnum_t n_rows_per_block) { + + cs_real_t *restrict res = dev_res2x + (dev_red_version * 3); + cs_real_t *restrict res_to_clear = dev_res2x + ((dev_red_version ^ 0x01) * 3); + + const unsigned bdimx = CUDA_GPU_GROUPSIZE; + const unsigned bdimy = CUDA_GPU_GROUPS; + const unsigned tidx = threadIdx.x % CUDA_GPU_GROUPSIZE; + const unsigned tidy = threadIdx.x / CUDA_GPU_GROUPSIZE; + + // Shared array containing the ad_inv coefficients. + __shared__ cs_real_t + shared_ad_inv_storage[(DiagBlockSize * DiagBlockSize) * CUDA_GPU_GROUPS]; + cs_real_t *shared_ad_inv = + shared_ad_inv_storage + tidy * (DiagBlockSize * DiagBlockSize); + + // Each thread will have its local vx for the matrix operations. + cs_real_t local_vx0[DiagBlockSize]; + cs_real_t local_vx[DiagBlockSize]; + + // Arrays to enable coalesced loading of arrays from memory. 
+ __shared__ cs_real_t shared_vxm1_storage[DiagBlockSize * CUDA_GPU_GROUPS]; + __shared__ cs_real_t shared_ad_storage[DiagBlockSize * CUDA_GPU_GROUPS]; + cs_real_t *shared_vxm1 = shared_vxm1_storage + tidy * DiagBlockSize; + cs_real_t *shared_ad = shared_ad_storage + tidy * DiagBlockSize; + + // Shared memory to help in block reduction. + __shared__ cs_real_t shared_res[CUDA_GPU_NUMTHD / CUDA_GPU_WARP_SIZE]; + + // We expect a 2D block, where y is the row a group of threads cooperate on, + // and x is the ID of each thread working on the same resulting vector + // element. + const cs_lnum_t StartRow = n_rows_per_block * blockIdx.x + tidy; + const cs_lnum_t EndRowExp = StartRow + n_rows_per_block; + const cs_lnum_t EndRow = (n_rows < EndRowExp) ? n_rows : EndRowExp; + + // Zero the buffer for next reduction. + if (Kind == MV_Seidel_With_Red && !tidx && StartRow < 3) + res_to_clear[StartRow] = 0.0; + + cs_real_t local_res = 0.0; + + for (cs_lnum_t iit = StartRow; iit < EndRow; iit += bdimy) { + unsigned AM = __activemask(); + + cs_lnum_t ii = iit; + // Version with reduction uses a reverse scan. + if (Kind == MV_Seidel_With_Red) + ii = n_rows - iit - 1; + + const cs_lnum_t r0 = dev_row_index[ii]; + const cs_lnum_t r1 = dev_row_index[ii + 1]; + + const cs_lnum_t *restrict col_id = dev_col_id + r0; + const cs_real_t *restrict m_row = dev_val + r0; + const cs_lnum_t n_cols = r1 - r0; + + // Initialize the local vx0 and vxm1 to zero. + for (auto &v : local_vx0) + v = 0; + + // Get RHS into vx0. + for (cs_lnum_t kk = tidx; kk < diag_block_size; kk += bdimx) { + local_vx0[kk] = dev_rhs[ii * dev_db1 + kk]; + + if (Kind == MV_Seidel_With_Red) { + shared_vxm1[kk] = dev_vx[ii * dev_db1 + kk]; + shared_ad[kk] = dev_ad[ii * dev_db1 + kk]; + } + } + + // Load ad_inv coefficients. + for (cs_lnum_t kk = tidx; kk < dev_db3; kk += bdimx) + shared_ad_inv[kk] = *(dev_ad_inv + dev_db3 * ii + kk); + + // Perform matrix vector operations. dev_vx access is not coalesced. + for (cs_lnum_t jj = tidx; jj < n_cols; jj += bdimx) { + // Coalesced accesses for the column and row arrays. + const auto row = m_row[jj]; + const auto col = col_id[jj]; + for (cs_lnum_t kk = 0; kk < diag_block_size; kk++) + local_vx0[kk] -= (row * dev_vx[col * dev_db1 + kk]); + } + + // Make a reduction of local vx0. + for (cs_lnum_t jj = 1; jj < bdimx; jj *= 2) + for (cs_lnum_t kk = 0; kk < diag_block_size; kk += 1) + local_vx0[kk] += __shfl_down_sync(AM, local_vx0[kk], jj, bdimx); + + // Master thread of each group does the forward/backward compute and store + // the result to memory. + if (!tidx) { + + _fw_and_bw_lu_gs(shared_ad_inv, dev_db0, local_vx, local_vx0); + + for (cs_lnum_t kk = 0; kk < diag_block_size; kk += 1) { + if (Kind == MV_Seidel_With_Red) { + register cs_real_t r = + shared_ad[kk] * (local_vx[kk] - shared_vxm1[kk]); + local_res += (r * r); + } + + dev_vx[ii * dev_db1 + kk] = local_vx[kk]; + } + } + } + + // Reduce the result across groups, warps and blocks. + if (Kind == MV_Seidel_With_Red) { + // Make a reduction across groups - we have a result for each group. + for (cs_lnum_t kk = CUDA_GPU_GROUPSIZE; kk < CUDA_GPU_WARP_SIZE; kk *= 2) + local_res += __shfl_down_sync(0xffffffff, local_res, kk); + + const unsigned numWarps = blockDim.x / CUDA_GPU_WARP_SIZE; + const unsigned laneID = threadIdx.x % CUDA_GPU_WARP_SIZE; + const unsigned warpID = threadIdx.x / CUDA_GPU_WARP_SIZE; + + // First lane of each warp record its contribution to shared memory. 
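+    // (The shuffle reduction above leaves each warp's partial sum in its
+    // lane 0; warp 0 combines those per-warp values further below.)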
+ if (!laneID) + shared_res[warpID] = local_res; + + __syncthreads(); + + // We only need the first warp from now on. + if (warpID) + return; + + // Each thread in the warp picks one element from shared memory. + local_res = 0.0; + if (laneID < numWarps) + local_res = shared_res[laneID]; + + // Make a reduction across the warp, but now we only need to cover numWarps. + for (cs_lnum_t kk = 1; kk < numWarps; kk *= 2) + local_res += __shfl_down_sync(0xffffffff, local_res, kk); + + // Write results atomically to memory. + if (!laneID) { +#if (__CUDA_ARCH__ < 600) + oldAtomicAdd(res + 0, local_res); +#else + atomicAdd(res + 0, local_res); +#endif + } + } + + return; +} + +// Kernel to perform dot products. +template +__device__ void +dot_product(cs_lnum_t version, cs_lnum_t n_rows, const cs_real_t *restrict x, + const cs_real_t *restrict y, const cs_real_t *restrict z, + cs_real_t *restrict res2x, cs_lnum_t n_rows_per_block) { + + cs_real_t *restrict res = res2x + (version * 3); + cs_real_t *restrict res_to_clear = res2x + ((version ^ 0x01) * 3); + + // We use 1 shared-memory element per warp. + __shared__ cs_real_t sh_xx[CUDA_GPU_NUMTHD / CUDA_GPU_WARP_SIZE]; + __shared__ cs_real_t sh_xy[CUDA_GPU_NUMTHD / CUDA_GPU_WARP_SIZE]; + __shared__ cs_real_t sh_yz[CUDA_GPU_NUMTHD / CUDA_GPU_WARP_SIZE]; + + // We expect a 1D block. + const cs_lnum_t StartRow = n_rows_per_block * blockIdx.x + threadIdx.x; + const cs_lnum_t EndRowExp = StartRow + n_rows_per_block; + const cs_lnum_t EndRow = (n_rows < EndRowExp) ? n_rows : EndRowExp; + + // Zero the buffer for next reduction. + if (StartRow < 3) + res_to_clear[threadIdx.x] = 0.0; + + cs_real_t xx = 0.0, xy = 0.0, yz = 0.0; + + for (cs_lnum_t ii = StartRow; ii < EndRow; ii += blockDim.x) { + cs_real_t xi, yi, zi; + + xi = x[ii]; + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + yi = y[ii]; + if (Kind == DP_xy_yz) + zi = z[ii]; + + if (Kind == DP_xx || Kind == DP_xx_xy) + xx += xi * xi; + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + xy += xi * yi; + if (Kind == DP_xy_yz) + yz += yi * zi; + } + + // Make a reduction across the warp + for (cs_lnum_t kk = 1; kk < CUDA_GPU_WARP_SIZE; kk *= 2) { + if (Kind == DP_xx || Kind == DP_xx_xy) + xx += __shfl_down_sync(0xffffffff, xx, kk); + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + xy += __shfl_down_sync(0xffffffff, xy, kk); + if (Kind == DP_xy_yz) + yz += __shfl_down_sync(0xffffffff, yz, kk); + } + + const unsigned numWarps = CUDA_GPU_NUMTHD / CUDA_GPU_WARP_SIZE; + const unsigned laneID = threadIdx.x % CUDA_GPU_WARP_SIZE; + const unsigned warpID = threadIdx.x / CUDA_GPU_WARP_SIZE; + + // First lane of each warp record its contribution to shared memory. + if (!laneID) { + if (Kind == DP_xx || Kind == DP_xx_xy) + sh_xx[warpID] = xx; + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + sh_xy[warpID] = xy; + if (Kind == DP_xy_yz) + sh_yz[warpID] = yz; + } + + __syncthreads(); + + // We only need the first warp from now on. + if (warpID) + return; + + // Each thread in the warp picks one element from shared memory. + xx = 0.0, xy = 0.0, yz = 0.0; + if (laneID < numWarps) { + if (Kind == DP_xx || Kind == DP_xx_xy) + xx = sh_xx[laneID]; + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + xy = sh_xy[laneID]; + if (Kind == DP_xy_yz) + yz = sh_yz[laneID]; + } + + // Make a reduction across the warp, but now we only need to cover numWarps. 
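+  // (With the default CUDA_GPU_NUMTHD of 128 there are only 4 warps per
+  // block, so at most 4 partial sums remain to be combined here.)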
+ for (cs_lnum_t kk = 1; kk < numWarps; kk *= 2) { + if (Kind == DP_xx || Kind == DP_xx_xy) + xx += __shfl_down_sync(0xffffffff, xx, kk); + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + xy += __shfl_down_sync(0xffffffff, xy, kk); + if (Kind == DP_xy_yz) + yz += __shfl_down_sync(0xffffffff, yz, kk); + } + + // Write results atomically to memory. + if (!laneID) { +#if (__CUDA_ARCH__ < 600) + if (Kind == DP_xx || Kind == DP_xx_xy) + oldAtomicAdd(res + 0, xx); + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + oldAtomicAdd(res + 1, xy); + if (Kind == DP_xy_yz) + oldAtomicAdd(res + 2, yz); +#else + if (Kind == DP_xx || Kind == DP_xx_xy) + atomicAdd(res + 0, xx); + if (Kind == DP_xx_xy || Kind == DP_xy_yz) + atomicAdd(res + 1, xy); + if (Kind == DP_xy_yz) + atomicAdd(res + 2, yz); +#endif + } + + return; +} + +// Kernel to perform vector operations. +template +__device__ void +vector_operation(cs_lnum_t n_rows, cs_real_t s, cs_real_t *restrict vc1, + cs_real_t *restrict vc2, const cs_real_t *restrict va, + const cs_real_t *restrict vb1, const cs_real_t *restrict vb2, + cs_lnum_t n_rows_per_block) { + + // We expect a 1D block. + const cs_lnum_t StartRow = n_rows_per_block * blockIdx.x + threadIdx.x; + const cs_lnum_t EndRowExp = StartRow + n_rows_per_block; + const cs_lnum_t EndRow = (n_rows < EndRowExp) ? n_rows : EndRowExp; + + for (cs_lnum_t ii = StartRow; ii < EndRow; ii += blockDim.x) { + switch (Kind) { + case VO_vc_equal_zero: + vc1[ii] = 0.0; + break; + case VO_vc_equal_va: + vc1[ii] = va[ii]; + break; + case VO_vc_sub_equal_va: + vc1[ii] -= va[ii]; + break; + case VO_vc_mul_equal_va: + vc1[ii] *= va[ii]; + break; + case VO_vc_equal_va_mul_vb: + vc1[ii] = va[ii] * vb1[ii]; + break; + case VO_vc_add_equal_s_mul_vb: + vc1[ii] += s * vb1[ii]; + break; + case VO_2x_vc_add_equal_s_mul_vb: + vc1[ii] += s * vb1[ii]; + vc2[ii] += s * vb2[ii]; + break; + case VO_vc_equal_va_add_s_mul_vb: + vc1[ii] = va[ii] + (s * vb1[ii]); + break; + } + } + + return; +} + +#define CS_CUDA_GPU_MAXIMUM_ARGUMENTS_PER_KERNEL 16 +#define CS_CUDA_GPU_MAXIMUM_NUM_KERNELS 5 + +// Class that manages the arguments for a kernel. +class KernelArgsBase { + // Union to track the size we need to fit all the types we use as argument. + union KernelArgType { + cs_lnum_t lnum_t; + cs_lnum_t *lnum_t_ptr; + cs_real_t real_t; + cs_real_t *real_t_ptr; + }; + + const KernelKinds KernelKind; + size_t NumberOfArgs = 0; + KernelArgType Args[CS_CUDA_GPU_MAXIMUM_ARGUMENTS_PER_KERNEL]; + +public: + template int setArg(ArgT Arg) { + + assert(NumberOfArgs < CS_CUDA_GPU_MAXIMUM_ARGUMENTS_PER_KERNEL && + "Storing too many args."); + + KernelArgType *Storage = &Args[NumberOfArgs++]; + *reinterpret_cast(Storage) = Arg; + return 0; + } + + KernelArgsBase(const KernelKinds KernelKind) : KernelKind(KernelKind) {} + KernelArgsBase() : KernelKind(InvalidKernelKind) {} + + template __host__ __device__ ArgT getArg(int Index) { + + assert(Index < CS_CUDA_GPU_MAXIMUM_ARGUMENTS_PER_KERNEL && + "Storing too many args."); + + KernelArgType *Storage = &Args[Index]; + return *reinterpret_cast(Storage); + } + + __host__ __device__ KernelKinds getKind(void) const { return KernelKind; } +}; + +template struct KernelArgs : public KernelArgsBase { + template KernelArgs(T... 
InputArgs) : KernelArgsBase(Kind) { + int expand[] = {0, setArg(InputArgs)...}; + (void)expand; + } +}; + +// Class to encode arguments for a series of kernels +struct KernelArgsSeries { + dim3 GridDim; + dim3 BlockDim; + unsigned RowsPerBlock = 0; + int NumKernels = 0; + KernelArgsBase Args[CS_CUDA_GPU_MAXIMUM_NUM_KERNELS]; + + // Add bundle of arguments. + template + void add(cs_lnum_t n_rows, T... args) { + + assert(NumKernels < CS_CUDA_GPU_MAXIMUM_NUM_KERNELS && + "Not expecting to deal with such a long series of kernels."); + + // Create launching dimensions. + if (!NumKernels) + RowsPerBlock = setBlockAndGridDimensions(n_rows, BlockDim, GridDim); + + // Set the new arguments in place. + new (&Args[NumKernels++]) KernelArgs(args...); + } + + // Transfer and execute in the GPU. + void flush(void); + + // Default Ctor + KernelArgsSeries() {} + +private: + // Utility function that returns a 64-bit integer ID for a list of kinds. This + // assumes a kernel can't have more than 5 kinds. + template + static constexpr uint64_t getKindsID(uint64_t ID = 0) { + return (FirstKind == InvalidKernelKind) + ? ID + : getKindsID((ID << 12) | + ((unsigned)FirstKind & 0xfff)); + } +}; + +KernelArgsSeries *KernelArgsSeriesHostVersions = nullptr; +KernelArgsSeries *KernelArgsSeriesHost = nullptr; +// We declare the device side as a char to prevent dynamic initialisation. +__constant__ char KernelArgsSeriesGPU[sizeof(KernelArgsSeries)]; + +template +__device__ int any_kernel(KernelArgsBase &Arg, unsigned n_rows_per_block) { + switch (Kind) { + // Matrix-vector: + case MV_CSR_No_Diag: + case MV_CSR: + case MV_MSR_No_Diag: + matrix_vector_multiplication( + /* dev_row_index */ Arg.getArg(0), + /* dev_col_id */ Arg.getArg(1), + /* dev_val */ Arg.getArg(2), + /* dev_d_val */ nullptr, + /* dev_x */ Arg.getArg(3), + /* dev_y */ Arg.getArg(4), + /* n_rows */ Arg.getArg(5), + /* n_cols */ Arg.getArg(6), + /* n_rows_per_block */ n_rows_per_block); + break; + case MV_MSR: + matrix_vector_multiplication( + /* dev_row_index */ Arg.getArg(0), + /* dev_col_id */ Arg.getArg(1), + /* dev_val */ Arg.getArg(2), + /* dev_d_val */ Arg.getArg(3), + /* dev_x */ Arg.getArg(4), + /* dev_y */ Arg.getArg(5), + /* n_rows */ Arg.getArg(6), + /* n_cols */ Arg.getArg(7), + /* n_rows_per_block */ n_rows_per_block); + break; + case MV_Seidel: + matrix_vector_seidel_block( + /* dev_row_index */ Arg.getArg(0), + /* dev_col_id */ Arg.getArg(1), + /* dev_val */ Arg.getArg(2), + /* dev_ad_inv */ Arg.getArg(3), + /* dev_ad */ nullptr, + /* dev_rhs */ Arg.getArg(4), + /* dev_vx */ Arg.getArg(5), + /* dev_red_version */ 0, + /* dev_red */ nullptr, + /* diag_block_size */ Arg.getArg(6), + /* dev_db0 */ Arg.getArg(7), + /* dev_db1 */ Arg.getArg(8), + /* dev_db2 */ Arg.getArg(9), + /* dev_db3 */ Arg.getArg(10), + /* n_rows */ Arg.getArg(11), + /* n_cols */ Arg.getArg(12), + /* n_rows_per_block */ n_rows_per_block); + break; + case MV_Seidel_With_Red: + matrix_vector_seidel_block( + /* dev_row_index */ Arg.getArg(0), + /* dev_col_id */ Arg.getArg(1), + /* dev_val */ Arg.getArg(2), + /* dev_ad_inv */ Arg.getArg(3), + /* dev_ad */ Arg.getArg(4), + /* dev_rhs */ Arg.getArg(5), + /* dev_vx */ Arg.getArg(6), + /* dev_red_version */ Arg.getArg(7), + /* dev_red */ Arg.getArg(8), + /* diag_block_size */ Arg.getArg(9), + /* dev_db0 */ Arg.getArg(10), + /* dev_db1 */ Arg.getArg(11), + /* dev_db2 */ Arg.getArg(12), + /* dev_db3 */ Arg.getArg(13), + /* n_rows */ Arg.getArg(14), + /* n_cols */ Arg.getArg(15), + /* n_rows_per_block */ n_rows_per_block); + break; + 
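+  // Note: the getArg indices used in each case must match the order in which
+  // the corresponding host-side entry point queued the arguments via
+  // KernelArgsSeriesHost->add(...).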
// + // Dot product: + // + case DP_xx: + dot_product( + /* version */ Arg.getArg(0), + /* n_rows */ Arg.getArg(1), + /* x */ Arg.getArg(2), + /* y */ nullptr, + /* z */ nullptr, + /* res */ Arg.getArg(3), + /* n_rows_per_block */ n_rows_per_block); + break; + case DP_xx_xy: + dot_product( + /* version */ Arg.getArg(0), + /* n_rows */ Arg.getArg(1), + /* x */ Arg.getArg(2), + /* y */ Arg.getArg(3), + /* z */ nullptr, + /* res */ Arg.getArg(4), + /* n_rows_per_block */ n_rows_per_block); + break; + case DP_xy_yz: + dot_product( + /* version */ Arg.getArg(0), + /* n_rows */ Arg.getArg(1), + /* x */ Arg.getArg(2), + /* y */ Arg.getArg(3), + /* z */ Arg.getArg(4), + /* res */ Arg.getArg(5), + /* n_rows_per_block */ n_rows_per_block); + break; + // + // Vector operations: + // + case VO_vc_equal_zero: + vector_operation( + /* n_rows */ Arg.getArg(0), + /* s */ 0.0, + /* vc1 */ Arg.getArg(1), + /* vc2 */ nullptr, + /* va */ nullptr, + /* vb1 */ nullptr, + /* vb2 */ nullptr, + /* n_rows_per_block */ n_rows_per_block); + break; + case VO_vc_equal_va: + case VO_vc_sub_equal_va: + case VO_vc_mul_equal_va: + vector_operation( + /* n_rows */ Arg.getArg(0), + /* s */ 0.0, + /* vc1 */ Arg.getArg(1), + /* vc2 */ nullptr, + /* va */ Arg.getArg(2), + /* vb1 */ nullptr, + /* vb2 */ nullptr, + /* n_rows_per_block */ n_rows_per_block); + break; + case VO_vc_equal_va_mul_vb: + vector_operation( + /* n_rows */ Arg.getArg(0), + /* s */ 0.0, + /* vc1 */ Arg.getArg(1), + /* vc2 */ nullptr, + /* va */ Arg.getArg(2), + /* vb1 */ Arg.getArg(3), + /* vb2 */ nullptr, + /* n_rows_per_block */ n_rows_per_block); + break; + case VO_vc_add_equal_s_mul_vb: + vector_operation( + /* n_rows */ Arg.getArg(0), + /* s */ Arg.getArg(1), + /* vc1 */ Arg.getArg(2), + /* vc2 */ nullptr, + /* va */ nullptr, + /* vb1 */ Arg.getArg(3), + /* vb2 */ nullptr, + /* n_rows_per_block */ n_rows_per_block); + break; + case VO_2x_vc_add_equal_s_mul_vb: + vector_operation( + /* n_rows */ Arg.getArg(0), + /* s */ Arg.getArg(1), + /* vc1 */ Arg.getArg(2), + /* vc2 */ Arg.getArg(3), + /* va */ nullptr, + /* vb1 */ Arg.getArg(4), + /* vb2 */ Arg.getArg(5), + /* n_rows_per_block */ n_rows_per_block); + break; + case VO_vc_equal_va_add_s_mul_vb: + vector_operation( + /* n_rows */ Arg.getArg(0), + /* s */ Arg.getArg(1), + /* vc1 */ Arg.getArg(2), + /* vc2 */ nullptr, + /* va */ Arg.getArg(3), + /* vb1 */ Arg.getArg(4), + /* vb2 */ nullptr, + /* n_rows_per_block */ n_rows_per_block); + break; + } + __syncthreads(); + return 0; +} + +template __global__ void any_kernels(void) { + + auto *KA = reinterpret_cast(&KernelArgsSeriesGPU[0]); + const unsigned n_rows_per_block = KA->RowsPerBlock; + unsigned idx = 0; + + int dummy[] = {any_kernel(KA->Args[idx++], n_rows_per_block)...}; + (void)dummy; +} + +void KernelArgsSeries::flush(void) { + // Nothing to flush. 
+ if (!NumKernels) + return; + + assert(KernelArgsSeriesHost == this && + "Not expecting to flush this version."); + + CHECK(cudaMemcpyToSymbolAsync(KernelArgsSeriesGPU, this, + sizeof(KernelArgsSeries), 0, + cudaMemcpyHostToDevice)); + + uint64_t ID = 0; + for (unsigned idx = 0; idx < NumKernels; ++idx) + ID = (ID << 12) | ((unsigned)Args[idx].getKind() & 0xfff); + + switch (ID) { + LAUNCH_KERNEL(DP_xx); + LAUNCH_KERNEL(DP_xx_xy); + LAUNCH_KERNEL(MV_Seidel); + LAUNCH_KERNEL(MV_Seidel_With_Red); + LAUNCH_KERNEL(MV_CSR, VO_vc_sub_equal_va, VO_vc_equal_va_mul_vb, + VO_vc_equal_va, DP_xx_xy); + LAUNCH_KERNEL(MV_CSR, DP_xy_yz); + LAUNCH_KERNEL(VO_2x_vc_add_equal_s_mul_vb, DP_xx); + LAUNCH_KERNEL(VO_vc_equal_va_mul_vb, DP_xx_xy); + LAUNCH_KERNEL(VO_vc_equal_va_add_s_mul_vb); + LAUNCH_KERNEL(VO_2x_vc_add_equal_s_mul_vb, VO_vc_equal_va_mul_vb, DP_xx_xy); + LAUNCH_KERNEL(MV_CSR); + LAUNCH_KERNEL(MV_MSR, VO_vc_sub_equal_va, VO_vc_equal_va_mul_vb, + VO_vc_equal_va, DP_xx_xy); + LAUNCH_KERNEL(MV_MSR, DP_xy_yz); + LAUNCH_KERNEL(MV_MSR, VO_vc_sub_equal_va); + LAUNCH_KERNEL(MV_MSR); + LAUNCH_KERNEL(VO_vc_equal_zero); + LAUNCH_KERNEL(VO_vc_equal_va_add_s_mul_vb, VO_vc_equal_zero); + LAUNCH_KERNEL(VO_vc_equal_va, DP_xx_xy); + LAUNCH_KERNEL(VO_2x_vc_add_equal_s_mul_vb); + default: + // couldn't find kernel. + printf("The implementation for following sequence of kernels could not be " + "found:\n"); + for (unsigned idx = 0; idx < NumKernels; ++idx) + printf(" --> %s\n", getKernelKindName(Args[idx].getKind())); + assert(false); + break; + } + + // We are using dual buffering here so that we can reset the structure for the + // next set of kernel. + if (this == KernelArgsSeriesHostVersions) + KernelArgsSeriesHost = KernelArgsSeriesHostVersions + 1; + else + KernelArgsSeriesHost = KernelArgsSeriesHostVersions; + KernelArgsSeriesHost->NumKernels = 0; +} + +void flush(void) { KernelArgsSeriesHost->flush(); } +} // namespace + +extern "C" { +void cs_cuda_map_alloc(const void *Pointer, size_t Size) { + MM.map(Pointer, Size, MemoryManager::maptype_alloc); +} +void cs_cuda_map_to(const void *Pointer, size_t Size) { + MM.map(Pointer, Size, MemoryManager::maptype_to); +} +void cs_cuda_map_from(const void *Pointer, size_t Size) { + MM.map(Pointer, Size, MemoryManager::maptype_from); +} +void cs_cuda_map_from_sync(const void *Pointer, size_t Size) { + MM.map(Pointer, Size, MemoryManager::maptype_from, /*Synchronous = */ true); +} +void cs_cuda_map_release(const void *Pointer, size_t Size) { + MM.map(Pointer, Size, MemoryManager::maptype_release); +} + +int cs_cuda_mat_vec_p_l_csr(bool exclude_diag, + const cs_lnum_t *restrict row_index, + const cs_lnum_t *restrict col_id, + const cs_real_t *restrict val, + const cs_real_t *restrict x, cs_real_t *restrict y, + cs_lnum_t n_rows, cs_lnum_t n_cols) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + // Matrix vector multiplication rows from one block may depend on columns + // computed by other blocks, so we need to flush here. 
+ flush(); + + cs_cuda_map_to(x, n_cols * sizeof(cs_real_t)); + cs_cuda_map_alloc(y, n_rows * sizeof(cs_real_t)); + + if (exclude_diag) + KernelArgsSeriesHost->add( + n_rows, MM.getDevicePtr(row_index), MM.getDevicePtr(col_id), + MM.getDevicePtr(val), MM.getDevicePtr(x), MM.getDevicePtr(y), n_rows, + n_cols); + else + KernelArgsSeriesHost->add(n_rows, MM.getDevicePtr(row_index), + MM.getDevicePtr(col_id), + MM.getDevicePtr(val), MM.getDevicePtr(x), + MM.getDevicePtr(y), n_rows, n_cols); + + cs_cuda_map_release(x, n_cols * sizeof(cs_real_t)); + cs_cuda_map_from_sync(y, n_rows * sizeof(cs_real_t)); + return 1; +} + +int cs_cuda_mat_vec_p_l_msr(bool exclude_diag, + const cs_lnum_t *restrict row_index, + const cs_lnum_t *restrict col_id, + const cs_real_t *restrict x_val, + const cs_real_t *restrict d_val, + const cs_real_t *restrict x, cs_real_t *restrict y, + cs_lnum_t n_rows, cs_lnum_t n_cols) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + // Matrix vector multiplication rows from one block may depend on columns + // computed by other blocks, so we need to flush here. + flush(); + + cs_cuda_map_to(x, n_cols * sizeof(cs_real_t)); + cs_cuda_map_alloc(y, n_rows * sizeof(cs_real_t)); + + if (exclude_diag || !d_val) + KernelArgsSeriesHost->add( + n_rows, MM.getDevicePtr(row_index), MM.getDevicePtr(col_id), + MM.getDevicePtr(x_val), nullptr, MM.getDevicePtr(x), MM.getDevicePtr(y), + n_rows, n_cols); + else + KernelArgsSeriesHost->add( + n_rows, MM.getDevicePtr(row_index), MM.getDevicePtr(col_id), + MM.getDevicePtr(x_val), MM.getDevicePtr(d_val), MM.getDevicePtr(x), + MM.getDevicePtr(y), n_rows, n_cols); + + cs_cuda_map_release(x, n_cols * sizeof(cs_real_t)); + cs_cuda_map_from_sync(y, n_rows * sizeof(cs_real_t)); + return 1; +} + +int cs_cuda_seidel_forward(const cs_lnum_t *restrict row_index, + const cs_lnum_t *restrict col_id, + const cs_real_t *restrict val, + const cs_real_t *restrict ad_inv, + const cs_real_t *restrict rhs, + cs_real_t *restrict vx, cs_lnum_t diag_block_size, + const int *db_size, cs_lnum_t n_rows, + cs_lnum_t n_cols) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + // Matrix vector multiplication rows from one block may depend on columns + // computed by other blocks, so we need to flush here. + flush(); + + cs_cuda_map_to(vx, diag_block_size * n_cols * sizeof(cs_real_t)); + cs_cuda_map_to(rhs, diag_block_size * n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, MM.getDevicePtr(row_index), MM.getDevicePtr(col_id), + MM.getDevicePtr(val), MM.getDevicePtr(ad_inv), MM.getDevicePtr(rhs), + MM.getDevicePtr(vx), diag_block_size, db_size[0], db_size[1], db_size[2], + db_size[3], n_rows, n_cols); + + flush(); + + cs_cuda_map_release(rhs, 0); + cs_cuda_map_from_sync(vx, 0); + + return 1; +} + +int cs_cuda_seidel_backward( + const cs_lnum_t *restrict row_index, const cs_lnum_t *restrict col_id, + const cs_real_t *restrict val, const cs_real_t *restrict ad_inv, + const cs_real_t *restrict ad, const cs_real_t *restrict rhs, + cs_real_t *restrict vx, cs_real_t *restrict red, cs_lnum_t diag_block_size, + const int *db_size, cs_lnum_t n_rows, cs_lnum_t n_cols) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + // Matrix vector multiplication rows from one block may depend on columns + // computed by other blocks, so we need to flush here. 
+ flush(); + + cs_cuda_map_to(vx, diag_block_size * n_cols * sizeof(cs_real_t)); + cs_cuda_map_to(rhs, diag_block_size * n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, MM.getDevicePtr(row_index), MM.getDevicePtr(col_id), + MM.getDevicePtr(val), MM.getDevicePtr(ad_inv), MM.getDevicePtr(ad), + MM.getDevicePtr(rhs), MM.getDevicePtr(vx), MM.getReductionVersion(), + MM.getReductionDevPtr(), diag_block_size, db_size[0], db_size[1], + db_size[2], db_size[3], n_rows, n_cols); + + flush(); + + // When we get the reduction results, a synchronization is imposed, so we + // don't have to do it for vx. + cs_cuda_map_release(rhs, 0); + cs_cuda_map_from(vx, 0); + + cs_real_t *res = MM.getReductionResults(); + *red = res[0]; + + return 1; +} + +int cs_cuda_dot_product_xx(cs_real_t *restrict xx, const cs_real_t *restrict x, + cs_lnum_t n_rows) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(x, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add(n_rows, MM.getReductionVersion(), n_rows, + MM.getDevicePtr(x), MM.getReductionDevPtr()); + + // flush(); + + cs_real_t *res = MM.getReductionResults(); + *xx = res[0]; + + cs_cuda_map_release(x, n_rows * sizeof(cs_real_t)); + return 1; +} + +int cs_cuda_dot_product_xx_xy(cs_real_t *restrict xx, cs_real_t *restrict xy, + const cs_real_t *restrict x, + const cs_real_t *restrict y, cs_lnum_t n_rows) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(x, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(y, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add(n_rows, MM.getReductionVersion(), n_rows, + MM.getDevicePtr(x), MM.getDevicePtr(y), + MM.getReductionDevPtr()); + + // flush(); + + cs_real_t *res = MM.getReductionResults(); + *xx = res[0]; + *xy = res[1]; + + cs_cuda_map_release(x, n_rows * sizeof(cs_real_t)); + cs_cuda_map_release(y, n_rows * sizeof(cs_real_t)); + return 1; +} + +int cs_cuda_dot_product_xy_yz(cs_real_t *restrict xy, cs_real_t *restrict yz, + const cs_real_t *restrict x, + const cs_real_t *restrict y, + const cs_real_t *restrict z, cs_lnum_t n_rows) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(x, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(y, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(z, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, MM.getReductionVersion(), n_rows, MM.getDevicePtr(x), + MM.getDevicePtr(y), MM.getDevicePtr(z), MM.getReductionDevPtr()); + + // flush(); + + cs_real_t *res = MM.getReductionResults(); + *xy = res[1]; + *yz = res[2]; + + cs_cuda_map_release(x, n_rows * sizeof(cs_real_t)); + cs_cuda_map_release(y, n_rows * sizeof(cs_real_t)); + cs_cuda_map_release(z, n_rows * sizeof(cs_real_t)); + return 1; +} + +int cs_cuda_vector_vc_equal_zero_if_exists(cs_lnum_t n_rows, cs_lnum_t n_elems, + cs_real_t *restrict vc) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + // Only have to do something if vc exists in the device. 
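+  // (With MustExist = false, getDevicePtr() returns nullptr instead of
+  // asserting when vc was never mapped to the device.)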
+ cs_real_t *DevPtr = MM.getDevicePtr(vc, /* MustExist = */ false); + if (!DevPtr) + return 0; + + KernelArgsSeriesHost->add(n_rows, + /* n_rows = */ n_elems, + /* vc1 = */ DevPtr); + + // flush(); + return 1; +} +int cs_cuda_vector_vc_equal_va(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_alloc(vc, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, + /* n_rows = */ n_rows, + /* vc1 = */ MM.getDevicePtr(vc), + /* va = */ MM.getDevicePtr(va)); + + // flush(); + cs_cuda_map_release(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_from_sync(vc, n_rows * sizeof(cs_real_t)); + return 1; +} +int cs_cuda_vector_vc_sub_equal_va(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(vc, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(va, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, + /* n_rows = */ n_rows, + /* vc1 = */ MM.getDevicePtr(vc), + /* va = */ MM.getDevicePtr(va)); + + // flush(); + cs_cuda_map_release(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_from_sync(vc, n_rows * sizeof(cs_real_t)); + return 1; +} +int cs_cuda_vector_vc_mul_equal_va(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(vc, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(va, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, + /* n_rows = */ n_rows, + /* vc1 = */ MM.getDevicePtr(vc), + /* va = */ MM.getDevicePtr(va)); + + // flush(); + cs_cuda_map_release(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_from_sync(vc, n_rows * sizeof(cs_real_t)); + return 1; +} +int cs_cuda_vector_vc_equal_va_mul_vb(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va, + const cs_real_t *restrict vb) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(vb, n_rows * sizeof(cs_real_t)); + cs_cuda_map_alloc(vc, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, + /* n_rows = */ n_rows, + /* vc1 = */ MM.getDevicePtr(vc), + /* va = */ MM.getDevicePtr(va), + /* vb1 = */ MM.getDevicePtr(vb)); + + // flush(); + cs_cuda_map_release(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_release(vb, n_rows * sizeof(cs_real_t)); + cs_cuda_map_from_sync(vc, n_rows * sizeof(cs_real_t)); + return 1; +} +int cs_cuda_vector_vc_add_equal_s_mul_vb(cs_lnum_t n_rows, cs_real_t s, + cs_real_t *restrict vc1, + const cs_real_t *restrict vb1, + cs_real_t *restrict vc2, + const cs_real_t *restrict vb2) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(vc1, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(vb1, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(vc2, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(vb2, n_rows * sizeof(cs_real_t)); + + if (vc2) + KernelArgsSeriesHost->add( + n_rows, + /* n_rows = */ n_rows, + /* s = */ s, + /* vc1 = */ MM.getDevicePtr(vc1), + /* vc2 = */ MM.getDevicePtr(vc2), + /* vb1 = */ MM.getDevicePtr(vb1), + /* vb2 = */ MM.getDevicePtr(vb2)); + else + KernelArgsSeriesHost->add( + n_rows, + /* n_rows = */ n_rows, + /* s = */ s, + /* vc1 = */ MM.getDevicePtr(vc1), + /* vb1 = */ MM.getDevicePtr(vb1)); + + // flush(); 
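+  // The explicit flush is not needed here: queued kernels are launched
+  // lazily, e.g. just before results are copied back to the host in
+  // MemoryManager::map().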
+ cs_cuda_map_release(vb1, n_rows * sizeof(cs_real_t)); + cs_cuda_map_release(vb2, n_rows * sizeof(cs_real_t)); + cs_cuda_map_from(vc1, n_rows * sizeof(cs_real_t)); + cs_cuda_map_from_sync(vc2, n_rows * sizeof(cs_real_t)); + return 1; +} +int cs_cuda_vector_vc_equal_va_add_s_mul_vb(cs_lnum_t n_rows, cs_real_t s, + cs_real_t *restrict vc, + const cs_real_t *restrict va, + const cs_real_t *restrict vb) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_cuda_map_to(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_to(vb, n_rows * sizeof(cs_real_t)); + cs_cuda_map_alloc(vc, n_rows * sizeof(cs_real_t)); + + KernelArgsSeriesHost->add( + n_rows, + /* n_rows = */ n_rows, + /* s = */ s, + /* vc1 = */ MM.getDevicePtr(vc), + /* va = */ MM.getDevicePtr(va), + /* vb1 = */ MM.getDevicePtr(vb)); + + // flush(); + cs_cuda_map_release(va, n_rows * sizeof(cs_real_t)); + cs_cuda_map_release(vb, n_rows * sizeof(cs_real_t)); + cs_cuda_map_from_sync(vc, n_rows * sizeof(cs_real_t)); + return 1; +} + +int cs_cuda_move_to_device_if_exists(cs_lnum_t n_rows, cs_lnum_t n_elems, + cs_real_t *restrict vc) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_real_t *DevPtr = MM.getDevicePtr(vc, /* MustExist = */ false); + if (!DevPtr) + return 0; + + flush(); + CHECK(cudaMemcpyAsync(DevPtr, vc, n_elems * sizeof(cs_real_t), + cudaMemcpyHostToDevice)); + return 1; +} + +int cs_cuda_move_from_device_if_exists(cs_lnum_t n_rows, cs_lnum_t n_elems, + cs_real_t *restrict vc) { + + if (!(NumberNodeDevices && n_rows > CS_CUDA_GPU_THRESHOLD)) + return 0; + + cs_real_t *DevPtr = MM.getDevicePtr(vc, /* MustExist = */ false); + if (!DevPtr) + return 0; + + flush(); + CHECK(cudaMemcpy(vc, DevPtr, n_elems * sizeof(cs_real_t), + cudaMemcpyDeviceToHost)); + return 1; +} + +void *cs_cuda_host_alloc(size_t size) { + if (NumberNodeDevices) { + void *ptr; + CHECK(cudaMallocHost(&ptr, size)); + assert(ptr && "CUDA host malloc returned a nullptr!"); + return ptr; + } + return nullptr; +} +int cs_cuda_host_free(void *ptr) { + if (NumberNodeDevices) { + CHECK(cudaFreeHost(ptr)); + return 1; + } + return 0; +} + +void cs_cuda_initialize(void) { + + int NumberVisibleDevices = 0; + CHECK(cudaGetDeviceCount(&NumberVisibleDevices)); + + if (NumberVisibleDevices) { + // We assume that the way devices are distributed by socket is taken care of + // by the scheduler. Therefore we should at most one device. The scheduler + // must also make sure to use a GPU in the same socket. + assert(NumberVisibleDevices == 1 && + "Expecting to have only one visible device by this rank."); + + DeviceID = 0; + + // Assume OpenMPI is being used. + // TODO: Find a portable way to do it. + const char *LocalSizeStr = getenv("OMPI_COMM_WORLD_LOCAL_SIZE"); + assert(LocalSizeStr && "Cannot find OMPI_COMM_WORLD_LOCAL_SIZE in " + "environment - expecting OpenMPI is in use."); + + NumberRanks = atoi(LocalSizeStr); + assert(NumberRanks > 0 && "Invalid number of ranks."); + + // Hardcode number of devices in the node to 4 - MPS sets number of devices + // to 1 which messes up with out memory allocations. 
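+    // CS_NUMBER_OF_GPUS_IN_THE_SYSTEM (assumed to be exported by the job
+    // script) can be used to override that default.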
+    const char *NumberNodeDevicesString =
+        getenv("CS_NUMBER_OF_GPUS_IN_THE_SYSTEM");
+    if (NumberNodeDevicesString) {
+      NumberNodeDevices = atoi(NumberNodeDevicesString);
+      assert(NumberNodeDevices > 0 && "Invalid GPU count in environment.");
+    } else
+      NumberNodeDevices = 4;
+
+    CHECK(cudaSetDevice(DeviceID));
+  } else
+    return;
+
+  assert(DeviceID >= 0 && "Invalid Device ID.");
+
+  //
+  // Sanity check: copy a random number to the GPU and back to verify that
+  // the device is working properly.
+  //
+  int hRin = 0, hRout = 0;
+  int *dR = nullptr;
+
+  srand(time(NULL));
+  while (hRin == hRout)
+    hRin = rand();
+
+  CHECK(cudaMalloc((void **)&dR, sizeof(int)));
+  CHECK(cudaMemcpy(dR, &hRin, sizeof(int), cudaMemcpyHostToDevice));
+  CHECK(cudaMemcpy(&hRout, dR, sizeof(int), cudaMemcpyDeviceToHost));
+  CHECK(cudaFree(dR));
+
+  if (hRin == hRout) {
+    printf(" --> Device %d seems to be working properly\n", DeviceID);
+
+    // Get properties of the target device.
+    CHECK(cudaGetDeviceProperties(&DeviceProperties, DeviceID));
+
+    // Initialize persistent storage.
+    MM.initialize();
+
+    // Initialize kernel argument buffers. We have two versions so that we can
+    // do dual buffering, i.e. move one to the device while resetting the other.
+    CHECK(cudaMallocHost((void **)&KernelArgsSeriesHostVersions,
+                         2 * sizeof(KernelArgsSeries)));
+    new (KernelArgsSeriesHostVersions) KernelArgsSeries[2];
+    KernelArgsSeriesHost = KernelArgsSeriesHostVersions;
+    assert(KernelArgsSeriesHost && KernelArgsSeriesHostVersions &&
+           "Invalid pointers.");
+
+  } else {
+    printf(" --> Device %d is NOT working properly\n", DeviceID);
+    NumberNodeDevices = 0;
+    assert(false);
+  }
+  return;
+}
+
+void cs_cuda_finalize(void) {
+  // We only need to clear state if we have any devices.
+  if (NumberNodeDevices) {
+    MM.finalize();
+    if (KernelArgsSeriesHostVersions)
+      CHECK(cudaFreeHost(KernelArgsSeriesHostVersions));
+  }
+}
+
+void cs_cuda_attempt_host_alloc(cs_real_t **ptr, int elems) {
+  *ptr = nullptr;
+  if (NumberNodeDevices) {
+    CHECK(cudaMallocHost(ptr, elems * sizeof(cs_real_t)));
+    assert(*ptr && "CUDA host malloc returned a nullptr!");
+  } else {
+    BFT_MALLOC(*ptr, elems, cs_real_t);
+    assert(*ptr && "Malloc returned invalid pointer.");
+  }
+}
+
+void cs_cuda_attempt_host_free(cs_real_t *ptr) {
+  if (NumberNodeDevices) {
+    CHECK(cudaFreeHost(ptr));
+  } else
+    BFT_FREE(ptr);
+}
+
+} // extern "C"
+
+#else
+
+extern "C" {
+// CUDA malloc/free default to a regular malloc/free if CUDA is not configured.
+void cs_cuda_attempt_host_alloc(cs_real_t **ptr, int elems) {
+  *ptr = nullptr;
+  BFT_MALLOC(*ptr, elems, cs_real_t);
+  assert(*ptr && "Malloc returned invalid pointer.");
+}
+void cs_cuda_attempt_host_free(cs_real_t *ptr) { BFT_FREE(ptr); }
+} // extern "C"
+
+#endif // HAVE_CUDA_OFFLOAD
diff --git a/src/cuda/cs_cuda.h b/src/cuda/cs_cuda.h
new file mode 100644
index 0000000000..c8ef4de9de
--- /dev/null
+++ b/src/cuda/cs_cuda.h
@@ -0,0 +1,132 @@
+/*============================================================================
+ * CUDA offloading support
+ *============================================================================*/
+
+/*
+  This file is part of Code_Saturne, a general-purpose CFD tool.
+
+  Copyright (C) IBM Corp. 2017, 2018
+
+  This program is free software; you can redistribute it and/or modify it under
+  the terms of the GNU General Public License as published by the Free Software
+  Foundation; either version 2 of the License, or (at your option) any later
+  version.
+ + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + details. + + You should have received a copy of the GNU General Public License along with + this program; if not, write to the Free Software Foundation, Inc., 51 Franklin + Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +#ifndef __CS_CUDA_H__ +#define __CS_CUDA_H__ + +/*----------------------------------------------------------------------------*/ + +#include "cs_defs.h" + +// +// Utility to execute function controlled by a conditional. +// +#define CS_CUDA_GPU_THRESHOLD GPU_THRESHOLD + +// +// If CUDA offload is not defined, the CUDA related entry points +// do not produce any action +// + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#ifdef HAVE_CUDA_OFFLOAD + +void cs_cuda_initialize(void); +void cs_cuda_finalize(void); +void cs_cuda_map_alloc(const void *Pointer, size_t Size); +void cs_cuda_map_to(const void *Pointer, size_t Size); +void cs_cuda_map_from(const void *Pointer, size_t Size); +void cs_cuda_map_from_sync(const void *Pointer, size_t Size); +void cs_cuda_map_release(const void *Pointer, size_t Size); + +int cs_cuda_mat_vec_p_l_csr(bool exclude_diag, + const cs_lnum_t *restrict row_index, + const cs_lnum_t *restrict col_id, + const cs_real_t *restrict val, + const cs_real_t *restrict x, cs_real_t *restrict y, + cs_lnum_t n_rows, cs_lnum_t n_cols); + +int cs_cuda_mat_vec_p_l_msr(bool exclude_diag, + const cs_lnum_t *restrict row_index, + const cs_lnum_t *restrict col_id, + const cs_real_t *restrict x_val, + const cs_real_t *restrict d_val, + const cs_real_t *restrict x, cs_real_t *restrict y, + cs_lnum_t n_rows, cs_lnum_t n_cols); + +int cs_cuda_seidel_forward(const cs_lnum_t *restrict row_index, + const cs_lnum_t *restrict col_id, + const cs_real_t *restrict val, + const cs_real_t *restrict ad_inv, + const cs_real_t *restrict rhs, + cs_real_t *restrict vx, cs_lnum_t diag_block_size, + const int *db_size, cs_lnum_t n_rows, + cs_lnum_t n_cols); + +int cs_cuda_seidel_backward( + const cs_lnum_t *restrict row_index, const cs_lnum_t *restrict col_id, + const cs_real_t *restrict val, const cs_real_t *restrict ad_inv, + const cs_real_t *restrict ad, const cs_real_t *restrict rhs, + cs_real_t *restrict vx, cs_real_t *restrict red, cs_lnum_t diag_block_size, + const int *db_size, cs_lnum_t n_rows, cs_lnum_t n_cols); + +int cs_cuda_dot_product_xx(cs_real_t *restrict xx, const cs_real_t *restrict x, + cs_lnum_t n_rows); +int cs_cuda_dot_product_xx_xy(cs_real_t *restrict xx, cs_real_t *restrict xy, + const cs_real_t *restrict x, + const cs_real_t *restrict y, cs_lnum_t n_rows); +int cs_cuda_dot_product_xy_yz(cs_real_t *restrict xy, cs_real_t *restrict yz, + const cs_real_t *restrict x, + const cs_real_t *restrict y, + const cs_real_t *restrict z, cs_lnum_t n_rows); +int cs_cuda_vector_vc_equal_zero_if_exists(cs_lnum_t n_rows, cs_lnum_t n_elems, + cs_real_t *restrict vc); +int cs_cuda_vector_vc_equal_va(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va); +int cs_cuda_vector_vc_sub_equal_va(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va); +int cs_cuda_vector_vc_mul_equal_va(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va); +int cs_cuda_vector_vc_equal_va_mul_vb(cs_lnum_t n_rows, cs_real_t *restrict vc, + const cs_real_t *restrict va, + const cs_real_t *restrict vb); 
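+
+// Note on the return convention: each of these GPU entry points returns 1
+// when the operation was executed on the device and 0 when the caller must
+// keep its CPU path (no device available, or n_rows not above
+// CS_CUDA_GPU_THRESHOLD). A typical call site therefore looks like the
+// following sketch, where cpu_dot_xx stands for a hypothetical CPU fallback:
+//
+//   if (!cs_cuda_dot_product_xx(&xx, x, n_rows))
+//     xx = cpu_dot_xx(x, n_rows);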
+int cs_cuda_vector_vc_add_equal_s_mul_vb(cs_lnum_t n_rows, cs_real_t s, + cs_real_t *restrict vc1, + const cs_real_t *restrict vb1, + cs_real_t *restrict vc2, + const cs_real_t *restrict vb2); +int cs_cuda_vector_vc_equal_va_add_s_mul_vb(cs_lnum_t n_rows, cs_real_t s, + cs_real_t *restrict vc, + const cs_real_t *restrict va, + const cs_real_t *restrict vb); +int cs_cuda_move_to_device_if_exists(cs_lnum_t n_rows, cs_lnum_t n_elems, + cs_real_t *restrict vc); +int cs_cuda_move_from_device_if_exists(cs_lnum_t n_rows, cs_lnum_t n_elems, + cs_real_t *restrict vc); +void *cs_cuda_host_alloc(size_t size_val); +int cs_cuda_host_free(void *ptr); +#endif // HAVE_CUDA_OFFLOAD + +// Functions that must be defined regardless of the CUDA support +// being activated or not. They are used by the FORTRAN modules. +void cs_cuda_attempt_host_alloc(cs_real_t **ptr, int elems); +void cs_cuda_attempt_host_free(cs_real_t *ptr); + +#ifdef __cplusplus +} +#endif //__cplusplus +#endif // __CS_CUDA_H__