From 66d2dff1b30038e9d951a3fe499d80cf58144fe2 Mon Sep 17 00:00:00 2001 From: Ajay Panyala Date: Fri, 20 Sep 2024 00:39:41 -0700 Subject: [PATCH] cleanup scalapack logic, update CC script --- .github/workflows/c-cpp.yaml | 4 +- exachem/cc/scripts/ccsd_advisor.py | 38 ++++++++-- exachem/common/cutils.cpp | 92 +++++++++++------------- exachem/common/cutils.hpp | 36 +++++----- exachem/common/options/input_options.hpp | 2 +- exachem/scf/scf_hartree_fock.cpp | 20 ++---- exachem/scf/scf_iter.cpp | 4 +- exachem/scf/scf_outputs.cpp | 2 +- inputs/scripts/nwchem_to_exachem.py | 4 +- inputs/scripts/xyz_to_exachem.py | 2 - methods/ExaChem.cpp | 5 +- 11 files changed, 107 insertions(+), 102 deletions(-) diff --git a/.github/workflows/c-cpp.yaml b/.github/workflows/c-cpp.yaml index 1cafe9d..87fe9f9 100644 --- a/.github/workflows/c-cpp.yaml +++ b/.github/workflows/c-cpp.yaml @@ -150,8 +150,8 @@ jobs: if: ${{ matrix.os[1] == 'macos' }} shell: bash run: | - echo "CC=gcc-13" >> $GITHUB_ENV - echo "CXX=g++-13" >> $GITHUB_ENV + echo "CC=gcc-14" >> $GITHUB_ENV + echo "CXX=g++-14" >> $GITHUB_ENV echo "LA_VENDOR=OpenBLAS" >> $GITHUB_ENV - name: Cache install steps (backend = ga) diff --git a/exachem/cc/scripts/ccsd_advisor.py b/exachem/cc/scripts/ccsd_advisor.py index 8267be1..f5ea695 100644 --- a/exachem/cc/scripts/ccsd_advisor.py +++ b/exachem/cc/scripts/ccsd_advisor.py @@ -5,6 +5,27 @@ #python ccsd_advisor.py -oa 99 -ob 94 -va 394 -vb 399 -cv 4027 -ppn 4 -ram 512 -ctype uhf -diis 5 -nranks 10 -cache 8 -ts 32 +def get_mo_tiles(noa,nob,nva,nvb,ts): + est_nt = math.ceil(1.0 * noa / ts) + mo_tiles = [] + for x in range(0, est_nt): + mo_tiles.append(int(noa / est_nt + (x < (noa % est_nt)))) + + # est_nt = math.ceil(1.0 * nob / ts) + # for x in range(0, est_nt): + # mo_tiles.append(int(nob / est_nt + (x < (nob % est_nt)))) + + est_nt = math.ceil(1.0 * nva / ts) + for x in range(0, est_nt): + mo_tiles.append(int(nva / est_nt + (x < (nva % est_nt)))) + + # est_nt = math.ceil(1.0 * nvb / ts) + # for x in range(0, est_nt): + # mo_tiles.append(int(nvb / est_nt + (x < (nvb % est_nt)))) + + return mo_tiles + + def parseargs(argv=None): '''Command line options.''' @@ -202,14 +223,13 @@ def parseargs(argv=None): print("nbf: " + str(nbf)) print("\nTotal CPU memory required for Cholesky decomp of the 2e integrals: " + str(chol_mem) + " GiB") -print("\nTotal CPU memory required for CCSD calculation: " + str(ccsd_mem) + " GiB") +print("\nTotal CPU memory required for CCSD calculation: " + str(ccsd_mem) + " GiB\n") VabOab = v_alpha*o_beta*v_beta*o_alpha ts_guess=50 ts_max=ts_guess -tilesizes = list(range(ts_guess, 501, 5)) -tilesizes.insert(0,73) +tilesizes = list(range(ts_guess, 501, 10)) def get_ts_recommendation(tilesizes,nranks): ts_guess_ = tilesizes[0] @@ -217,9 +237,10 @@ def get_ts_recommendation(tilesizes,nranks): nblocks_ = 10 for ts in tilesizes: nblocks = math.ceil(v_alpha/ts) * math.ceil(o_alpha/ts) * math.ceil(v_beta/ts) * math.ceil(o_beta/ts) - # print("nblocks %s for TS = %s " %(nblocks,ts)) + # print (" --> MO Tiles for tilesize %s, nblocks=%s: %s" %(ts, nblocks, get_mo_tiles(o_alpha,o_beta,v_alpha,v_beta,ts))) ts_max_ = ts - if nblocks <= nranks: + #nblocks <= nranks + if (nblocks*1.0/nranks) < 0.31: ts_max_ = ts_guess_ break ts_guess_=ts @@ -229,12 +250,16 @@ def get_ts_recommendation(tilesizes,nranks): [ts_max,nblocks] = get_ts_recommendation(tilesizes,nranks) print("Min #nodes required = %s, nranks = %s, nblocks = %s, max tilesize = %s" %(nnodes, nranks, nblocks, ts_max)) +# print (" --> MO Tiles = %s" %(get_mo_tiles(o_alpha,o_beta,v_alpha,v_beta,ts_max))) nodecounts = list(range(nnodes+10, nnodes*10+1, 10)) for nc in nodecounts: + # print ("-----------------------------------------------------------------------") [ts_max,nblocks] = get_ts_recommendation(tilesizes,nc*ppn) - if nblocks <= nc*ppn: break + # if nblocks <= nc*ppn: break + if (nblocks*1.0/nc*ppn) < 0.31: break print("For node count = %s, nranks = %s, nblocks = %s, max tilesize = %s" %(nc, nc*ppn, nblocks, ts_max)) + # print (" --> MO Tiles = %s" %(get_mo_tiles(o_alpha,o_beta,v_alpha,v_beta,ts_max))) # (T) @@ -326,6 +351,7 @@ def ft_mem(i,j,k,l) : total_extra_buf_mem = round(total_extra_buf_mem,2) total_ccsd_t_mem = ccsd_t_mem + total_extra_buf_mem + total_cache_mem +total_ccsd_t_mem = round(total_ccsd_t_mem,2) print("\nTotal CPU memory required for (T) calculation: " + str(total_ccsd_t_mem) + " GiB" + ", Min nodes required: " + str(math.ceil(total_ccsd_t_mem/cpu_mem))) print("-- memory required for the input tensors: " + str(ccsd_t_mem) + " GiB") diff --git a/exachem/common/cutils.cpp b/exachem/common/cutils.cpp index f84325b..627e5d5 100644 --- a/exachem/common/cutils.cpp +++ b/exachem/common/cutils.cpp @@ -26,50 +26,56 @@ MPI_Comm get_scalapack_comm(tamm::ExecutionContext& ec, int sca_nranks) { return scacomm; } -void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_Comm& scacomm) { +void setup_scalapack_info(tamm::ExecutionContext& ec, ChemEnv& chem_env, + ScalapackInfo& scalapack_info, ProcGroupData& pgdata) { SystemData& sys_data = chem_env.sys_data; SCFOptions& scf_options = chem_env.ioptions.scf_options; #if defined(USE_UPCXX) abort(); // Not supported with UPC++ #endif - scalapack_info.comm = scacomm; - if(scacomm == MPI_COMM_NULL) return; - auto blacs_setup_st = std::chrono::high_resolution_clock::now(); - // Sanity checks - scalapack_info.npr = scf_options.scalapack_np_row; - scalapack_info.npc = scf_options.scalapack_np_col; - int scalapack_nranks = scalapack_info.npr * scalapack_info.npc; - - scalapack_info.pg = ProcGroup::create_coll(scacomm); - scalapack_info.ec = - ExecutionContext{scalapack_info.pg, DistributionKind::dense, MemoryManagerKind::ga}; - int sca_world_size = scalapack_info.pg.size().value(); - - // Default to square(ish) grid - if(scalapack_nranks == 0) { - int64_t npr = std::sqrt(sca_world_size); - int64_t npc = sca_world_size / npr; - while(npr * npc != sca_world_size) { - npr--; - npc = sca_world_size / npr; - } - scalapack_nranks = sca_world_size; - scalapack_info.npr = npr; - scalapack_info.npc = npc; - } + scalapack_info.npr = scf_options.scalapack_np_row; + scalapack_info.npc = scf_options.scalapack_np_col; + int sca_user_ranks = scalapack_info.npr * scalapack_info.npc; + + // node_p_sca = % of nodes, nbf_p_sca = 4 = % of nbf for scalapack + int sca_nranks = std::ceil(sys_data.nbf_orig * (4 / 100.0)); + // if(node_p_sca > 0) sca_nranks = std::ceil((node_p_sca / 100.0) * nnodes); + if(sca_user_ranks > 0) sca_nranks = sca_user_ranks; + if(sca_nranks > pgdata.spg_nranks) sca_nranks = pgdata.spg_nranks; + // Find nearest square + sca_nranks = std::pow(std::floor(std::sqrt(sca_nranks)), 2); + if(sca_nranks == 0) sca_nranks = 1; + + int sca_nnodes = sca_nranks / pgdata.ppn; + if(sca_nranks % pgdata.ppn > 0 || sca_nnodes == 0) sca_nnodes++; + // if(sca_nnodes > pgdata.spg_nnodes) sca_nnodes = pgdata.spg_nnodes; + + pgdata.scalapack_nnodes = sca_nnodes; + pgdata.scalapack_nranks = sca_nranks; + scalapack_info.scalapack_nranks = sca_nranks; + + // Always use square grid + scalapack_info.npr = std::sqrt(sca_nranks); + scalapack_info.npc = scalapack_info.npr; - EXPECTS(sca_world_size >= scalapack_nranks); + scalapack_info.comm = get_scalapack_comm(ec, sca_nranks); - if(not scalapack_nranks) scalapack_nranks = sca_world_size; - std::vector scalapack_ranks(scalapack_nranks); + if(scalapack_info.comm == MPI_COMM_NULL) return; + // auto blacs_setup_st = std::chrono::high_resolution_clock::now(); + + scalapack_info.pg = ProcGroup::create_coll(scalapack_info.comm); + scalapack_info.ec = + ExecutionContext{scalapack_info.pg, DistributionKind::dense, MemoryManagerKind::ga}; + + std::vector scalapack_ranks(sca_nranks); std::iota(scalapack_ranks.begin(), scalapack_ranks.end(), 0); - scalapack_info.scalapack_nranks = scalapack_nranks; int& mb_ = scf_options.scalapack_nb; if(scalapack_info.pg.rank() == 0) { - std::cout << "scalapack_nranks = " << scalapack_nranks << std::endl; + std::cout << "scalapack_nnodes = " << sca_nnodes << std::endl; + std::cout << "scalapack_nranks = " << sca_nranks << std::endl; std::cout << "scalapack_np_row = " << scalapack_info.npr << std::endl; std::cout << "scalapack_np_col = " << scalapack_info.npc << std::endl; std::cout << "scalapack_nb = " << mb_ << std::endl; @@ -90,9 +96,9 @@ void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_ scalapack_info.blockcyclic_dist = std::make_unique(*scalapack_info.blacs_grid, mb_, mb_, 0, 0); - auto blacs_setup_en = std::chrono::high_resolution_clock::now(); + // auto blacs_setup_en = std::chrono::high_resolution_clock::now(); - std::chrono::duration blacs_time = blacs_setup_en - blacs_setup_st; + // std::chrono::duration blacs_time = blacs_setup_en - blacs_setup_st; // if(scalapack_info.pg.rank() == 0) // std::cout << std::fixed << std::setprecision(2) << std::endl @@ -103,7 +109,7 @@ void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_ // Nbf, % of nodes, % of Nbf, nnodes from input file, (% of nodes, % of nbf) for scalapack ProcGroupData get_spg_data(ExecutionContext& ec, const size_t N, const int node_p, const int nbf_p, - const int node_inp, const int node_p_sca, const int nbf_p_sca) { + const int node_inp) { ProcGroupData pgdata; pgdata.nnodes = ec.nnodes(); pgdata.ppn = ec.ppn(); @@ -126,23 +132,5 @@ ProcGroupData get_spg_data(ExecutionContext& ec, const size_t N, const int node_ pgdata.spg_nnodes = spg_nnodes; pgdata.spg_nranks = spg_nranks; -#if defined(USE_SCALAPACK) - // Find nearest square - int sca_nranks = std::ceil(N * (nbf_p_sca / 100.0)); - if(node_p_sca > 0) sca_nranks = std::ceil((node_p_sca / 100.0) * nnodes); - if(sca_nranks > spg_nranks) sca_nranks = spg_nranks; - sca_nranks = std::pow(std::floor(std::sqrt(sca_nranks)), 2); - if(sca_nranks == 0) sca_nranks = 1; - int sca_nnodes = sca_nranks / ppn; - if(sca_nranks % ppn > 0 || sca_nnodes == 0) sca_nnodes++; - if(sca_nnodes > nnodes) sca_nnodes = nnodes; - // if(sca_nnodes == 1) ppn = sca_nranks; // single node case - pgdata.scalapack_nnodes = sca_nnodes; - pgdata.scalapack_nranks = sca_nranks; -#else - pgdata.scalapack_nnodes = spg_nnodes; - pgdata.scalapack_nranks = spg_nranks; -#endif - return pgdata; } diff --git a/exachem/common/cutils.hpp b/exachem/common/cutils.hpp index 16285c7..70c2fbf 100644 --- a/exachem/common/cutils.hpp +++ b/exachem/common/cutils.hpp @@ -53,6 +53,19 @@ struct ScalapackInfo { }; #endif +// Contains node, ppn information used for creating a smaller process group from world group +struct ProcGroupData { + int nnodes{}; // total number of nodes + int spg_nnodes{}; // number of nodes in smaller process group + int ppn{}; // processes per node + int spg_nranks{}; // number of rank in smaller process group + // #nodes used for scalapack operations can further be a subset of the smaller process group + int scalapack_nnodes{}; + int scalapack_nranks{}; + + auto unpack() { return std::make_tuple(nnodes, spg_nnodes, ppn, spg_nranks); } +}; + #if defined(USE_SCALAPACK) struct ScalapackInfo { int64_t npr{}, npc{}, scalapack_nranks{}; @@ -66,25 +79,10 @@ struct ScalapackInfo { MPI_Comm get_scalapack_comm(tamm::ExecutionContext& ec, int sca_nranks); -void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_Comm& scacomm); +void setup_scalapack_info(tamm::ExecutionContext& ec, ChemEnv& chem_env, + ScalapackInfo& scalapack_info, ProcGroupData& pgdata); #endif -// Contains node, ppn information used for creating a smaller process group from world group -struct ProcGroupData { - int nnodes{}; // total number of nodes - int spg_nnodes{}; // number of nodes in smaller process group - int ppn{}; // processes per node - int spg_nranks{}; // number of rank in smaller process group - // #nodes used for scalapack operations can further be a subset of the smaller process group - int scalapack_nnodes{}; - int scalapack_nranks{}; - - auto unpack() { - return std::make_tuple(nnodes, spg_nnodes, ppn, spg_nranks, scalapack_nnodes, scalapack_nranks); - } -}; - -// Nbf, % of nodes, % of Nbf, nnodes from input file, (% of nodes, % of nbf) for scalapack +// Nbf, % of nodes, % of Nbf, nnodes from input file ProcGroupData get_spg_data(ExecutionContext& ec, const size_t N, const int node_p, - const int nbf_p = -1, const int node_inp = -1, const int node_p_sca = -1, - const int nbf_p_sca = -1); + const int nbf_p = -1, const int node_inp = -1); diff --git a/exachem/common/options/input_options.hpp b/exachem/common/options/input_options.hpp index 164b380..af65a4c 100644 --- a/exachem/common/options/input_options.hpp +++ b/exachem/common/options/input_options.hpp @@ -43,7 +43,7 @@ class SCFOptions: public CommonOptions { int multiplicity{1}; double lshift{0}; // level shift factor, +ve value b/w 0 and 1 double tol_int{1e-22}; // tolerance for integral primitive screening - double tol_sch{1e-10}; // tolerance for schwarz screening + double tol_sch{1e-12}; // tolerance for schwarz screening double tol_lindep{1e-5}; // tolerance for linear dependencies double conve{1e-8}; // energy convergence double convd{1e-7}; // density convergence diff --git a/exachem/scf/scf_hartree_fock.cpp b/exachem/scf/scf_hartree_fock.cpp index 5017187..dd8310b 100644 --- a/exachem/scf/scf_hartree_fock.cpp +++ b/exachem/scf/scf_hartree_fock.cpp @@ -54,8 +54,8 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e // -------------Everythin related to Basis Sets----------------------------- #if SCF_THROTTLE_RESOURCES - ProcGroupData pgdata = get_spg_data(exc, N, -1, 50, chem_env.ioptions.scf_options.nnodes, -1, 4); - auto [t_nnodes, hf_nnodes, ppn, hf_nranks, sca_nnodes, sca_nranks] = pgdata.unpack(); + ProcGroupData pgdata = get_spg_data(exc, N, -1, 50, chem_env.ioptions.scf_options.nnodes); + auto [t_nnodes, hf_nnodes, ppn, hf_nranks] = pgdata.unpack(); #if defined(USE_UPCXX) bool in_new_team = (rank < hf_nranks); @@ -76,9 +76,6 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e MPI_Group_free(&hfgroup); #endif -#if defined(USE_SCALAPACK) - MPI_Comm scacomm = get_scalapack_comm(exc, sca_nranks); -#endif #endif if(rank == 0) { @@ -86,10 +83,6 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e #if SCF_THROTTLE_RESOURCES std::cout << "Number of nodes, processes per node used for SCF calculation: " << hf_nnodes << ", " << ppn << std::endl; -#endif -#if defined(USE_SCALAPACK) - cout << "Number of nodes, processes per node, total processes used for Scalapack operations: " - << sca_nnodes << ", " << sca_nranks / sca_nnodes << ", " << sca_nranks << endl; #endif chem_env.ioptions.common_options.print(); chem_env.ioptions.scf_options.print(); @@ -268,7 +261,8 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e ScalapackInfo scalapack_info; #if defined(USE_SCALAPACK) - setup_scalapack_info(chem_env, scalapack_info, scacomm); + setup_scalapack_info(ec, chem_env, scalapack_info, pgdata); + MPI_Comm scacomm = scalapack_info.comm; #endif #if defined(USE_GAUXC) @@ -585,12 +579,12 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e Matrix S(chem_env.sys_data.nbf_orig, chem_env.sys_data.nbf_orig); tamm_to_eigen_tensor(ttensors.S1, S); if(chem_env.sys_data.is_restricted) - cout << "debug #electrons = " << (int) std::ceil((etensors.D_alpha * S).trace()) + cout << "debug #electrons = " << (int) std::round((etensors.D_alpha * S).trace()) << endl; if(chem_env.sys_data.is_unrestricted) { - cout << "debug #alpha electrons = " << (int) std::ceil((etensors.D_alpha * S).trace()) + cout << "debug #alpha electrons = " << (int) std::round((etensors.D_alpha * S).trace()) << endl; - cout << "debug #beta electrons = " << (int) std::ceil((etensors.D_beta * S).trace()) + cout << "debug #beta electrons = " << (int) std::round((etensors.D_beta * S).trace()) << endl; } } diff --git a/exachem/scf/scf_iter.cpp b/exachem/scf/scf_iter.cpp index 5f40673..9c90cec 100644 --- a/exachem/scf/scf_iter.cpp +++ b/exachem/scf/scf_iter.cpp @@ -23,7 +23,6 @@ std::tuple exachem::scf::SCFIter::scf_iter_body( const bool is_uhf = sys_data.is_unrestricted; const bool is_rhf = sys_data.is_restricted; - const bool do_snK = sys_data.do_snK; const double lshift = scf_vars.lshift; Tensor& H1 = ttensors.H1; @@ -59,6 +58,7 @@ std::tuple exachem::scf::SCFIter::scf_iter_body( double ehf = 0.0; #if defined(USE_GAUXC) + const bool do_snK = sys_data.do_snK; if(do_snK) { const auto snK_start = std::chrono::high_resolution_clock::now(); scf::gauxc::compute_exx(ec, chem_env, scf_vars, ttensors, etensors, @@ -448,8 +448,6 @@ void exachem::scf::SCFIter::compute_2c_ints(ExecutionContext& ec, ChemEnv& chem_ SCFOptions& scf_options = chem_env.ioptions.scf_options; - auto rank = ec.pg().rank(); - const libint2::BasisSet& dfbs = scf_vars.dfbs; const std::vector& dfAO_tiles = scf_vars.dfAO_tiles; const std::vector& df_shell_tile_map = scf_vars.df_shell_tile_map; diff --git a/exachem/scf/scf_outputs.cpp b/exachem/scf/scf_outputs.cpp index 0a078d6..bb6548e 100644 --- a/exachem/scf/scf_outputs.cpp +++ b/exachem/scf/scf_outputs.cpp @@ -62,7 +62,7 @@ void exachem::scf::SCFIO::print_energies(ExecutionContext& ec, ChemEnv& chem_env } if(ec.pg().rank() == 0) { - std::cout << "#electrons = " << (int) std::ceil(nelectrons) << endl; + std::cout << "#electrons = " << (int) std::round(nelectrons) << endl; std::cout << "1e energy kinetic = " << std::setprecision(16) << kinetic_1e << endl; std::cout << "1e energy N-e = " << NE_1e << endl; std::cout << "1e energy = " << energy_1e << endl; diff --git a/inputs/scripts/nwchem_to_exachem.py b/inputs/scripts/nwchem_to_exachem.py index 69946b9..a4e2166 100644 --- a/inputs/scripts/nwchem_to_exachem.py +++ b/inputs/scripts/nwchem_to_exachem.py @@ -110,7 +110,7 @@ def parse_nwchem_input(input_file): nwchem_opt["SCF"]["PRINT"] = {} scf_opt = nwchem_opt["SCF"] scf_opt["tol_int"] = 1e-22 - scf_opt["tol_sch"] = 1e-10 + scf_opt["tol_sch"] = 1e-12 scf_opt["tol_lindep"] = 1e-5 scf_opt["conve"] = 1e-8 scf_opt["convd"] = 1e-7 @@ -129,7 +129,7 @@ def parse_nwchem_input(input_file): dft_opt = nwchem_opt["DFT"] dft_opt["hfexch"] = False dft_opt["tol_int"] = 1e-22 - dft_opt["tol_sch"] = 1e-10 + dft_opt["tol_sch"] = 1e-12 dft_opt["tol_lindep"] = 1e-5 dft_opt["conve"] = 1e-8 dft_opt["convd"] = 1e-7 diff --git a/inputs/scripts/xyz_to_exachem.py b/inputs/scripts/xyz_to_exachem.py index 9cc4b2a..b82546a 100644 --- a/inputs/scripts/xyz_to_exachem.py +++ b/inputs/scripts/xyz_to_exachem.py @@ -126,8 +126,6 @@ def dict_to_json(dictname): exachem_opt["SCF"] = {} scf_opt = exachem_opt["SCF"] - scf_opt["tol_int"] = 1e-12 - scf_opt["tol_lindep"] = 1e-5 scf_opt["conve"] = 1e-8 scf_opt["convd"] = 1e-7 scf_opt["diis_hist"] = 10 diff --git a/methods/ExaChem.cpp b/methods/ExaChem.cpp index ae1f974..fab43b8 100644 --- a/methods/ExaChem.cpp +++ b/methods/ExaChem.cpp @@ -70,8 +70,11 @@ int main(int argc, char* argv[]) { std::cout << "nnodes: " << ec.nnodes() << ", "; std::cout << "nproc_per_node: " << ec.ppn() << ", "; std::cout << "nproc_total: " << ec.nnodes() * ec.ppn() << ", "; +#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_DPCPP) std::cout << "ngpus_per_node: " << ec.gpn() << ", "; - std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << endl << endl; + std::cout << "ngpus_total: " << ec.nnodes() * ec.gpn() << endl; +#endif + std::cout << std::endl; ec.print_mem_info(); }