Skip to content

Commit

Permalink
cleanup scalapack logic, update CC script
Browse files Browse the repository at this point in the history
  • Loading branch information
ajaypanyala committed Sep 20, 2024
1 parent ec4146a commit 66d2dff
Show file tree
Hide file tree
Showing 11 changed files with 107 additions and 102 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/c-cpp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ jobs:
if: ${{ matrix.os[1] == 'macos' }}
shell: bash
run: |
echo "CC=gcc-13" >> $GITHUB_ENV
echo "CXX=g++-13" >> $GITHUB_ENV
echo "CC=gcc-14" >> $GITHUB_ENV
echo "CXX=g++-14" >> $GITHUB_ENV
echo "LA_VENDOR=OpenBLAS" >> $GITHUB_ENV
- name: Cache install steps (backend = ga)
Expand Down
38 changes: 32 additions & 6 deletions exachem/cc/scripts/ccsd_advisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,27 @@

#python ccsd_advisor.py -oa 99 -ob 94 -va 394 -vb 399 -cv 4027 -ppn 4 -ram 512 -ctype uhf -diis 5 -nranks 10 -cache 8 -ts 32

def get_mo_tiles(noa,nob,nva,nvb,ts):
    """Return the list of MO tile sizes for a requested tilesize.

    Each space is cut into ceil(dim / ts) tiles whose sizes differ by at
    most one; the larger tiles come first. Only the alpha spaces are tiled:
    the occupied-alpha tiles are followed by the virtual-alpha tiles.

    Args:
        noa: size of the occupied alpha space.
        nob: size of the occupied beta space (accepted but currently unused).
        nva: size of the virtual alpha space.
        nvb: size of the virtual beta space (accepted but currently unused).
        ts:  requested (maximum) tilesize; must be positive.

    Returns:
        A list of ints: tile sizes for noa followed by tile sizes for nva.

    Raises:
        ValueError: if ts is not positive.
    """
    if ts <= 0:
        raise ValueError("tilesize (ts) must be positive, got %s" % (ts,))

    def _split(dim):
        # Number of tiles needed so that no tile exceeds ts.
        ntiles = math.ceil(1.0 * dim / ts)
        if ntiles <= 0:
            return []
        # Exact integer split: the first `extra` tiles get one more element.
        base, extra = divmod(dim, ntiles)
        return [int(base + (idx < extra)) for idx in range(ntiles)]

    # Beta spaces (nob, nvb) are intentionally left out of the tiling for now;
    # if re-enabled the order would be noa, nob, nva, nvb.
    return _split(noa) + _split(nva)


def parseargs(argv=None):

'''Command line options.'''
Expand Down Expand Up @@ -202,24 +223,24 @@ def parseargs(argv=None):
print("nbf: " + str(nbf))

print("\nTotal CPU memory required for Cholesky decomp of the 2e integrals: " + str(chol_mem) + " GiB")
print("\nTotal CPU memory required for CCSD calculation: " + str(ccsd_mem) + " GiB")
print("\nTotal CPU memory required for CCSD calculation: " + str(ccsd_mem) + " GiB\n")


VabOab = v_alpha*o_beta*v_beta*o_alpha
ts_guess=50
ts_max=ts_guess
tilesizes = list(range(ts_guess, 501, 5))
tilesizes.insert(0,73)
tilesizes = list(range(ts_guess, 501, 10))

def get_ts_recommendation(tilesizes,nranks):
ts_guess_ = tilesizes[0]
ts_max_ = tilesizes[0]
nblocks_ = 10
for ts in tilesizes:
nblocks = math.ceil(v_alpha/ts) * math.ceil(o_alpha/ts) * math.ceil(v_beta/ts) * math.ceil(o_beta/ts)
# print("nblocks %s for TS = %s " %(nblocks,ts))
# print (" --> MO Tiles for tilesize %s, nblocks=%s: %s" %(ts, nblocks, get_mo_tiles(o_alpha,o_beta,v_alpha,v_beta,ts)))
ts_max_ = ts
if nblocks <= nranks:
#nblocks <= nranks
if (nblocks*1.0/nranks) < 0.31:
ts_max_ = ts_guess_
break
ts_guess_=ts
Expand All @@ -229,12 +250,16 @@ def get_ts_recommendation(tilesizes,nranks):

[ts_max,nblocks] = get_ts_recommendation(tilesizes,nranks)
print("Min #nodes required = %s, nranks = %s, nblocks = %s, max tilesize = %s" %(nnodes, nranks, nblocks, ts_max))
# print (" --> MO Tiles = %s" %(get_mo_tiles(o_alpha,o_beta,v_alpha,v_beta,ts_max)))

nodecounts = list(range(nnodes+10, nnodes*10+1, 10))
for nc in nodecounts:
# print ("-----------------------------------------------------------------------")
[ts_max,nblocks] = get_ts_recommendation(tilesizes,nc*ppn)
if nblocks <= nc*ppn: break
# if nblocks <= nc*ppn: break
if (nblocks*1.0/nc*ppn) < 0.31: break
print("For node count = %s, nranks = %s, nblocks = %s, max tilesize = %s" %(nc, nc*ppn, nblocks, ts_max))
# print (" --> MO Tiles = %s" %(get_mo_tiles(o_alpha,o_beta,v_alpha,v_beta,ts_max)))


# (T)
Expand Down Expand Up @@ -326,6 +351,7 @@ def ft_mem(i,j,k,l) :
total_extra_buf_mem = round(total_extra_buf_mem,2)

total_ccsd_t_mem = ccsd_t_mem + total_extra_buf_mem + total_cache_mem
total_ccsd_t_mem = round(total_ccsd_t_mem,2)

print("\nTotal CPU memory required for (T) calculation: " + str(total_ccsd_t_mem) + " GiB" + ", Min nodes required: " + str(math.ceil(total_ccsd_t_mem/cpu_mem)))
print("-- memory required for the input tensors: " + str(ccsd_t_mem) + " GiB")
Expand Down
92 changes: 40 additions & 52 deletions exachem/common/cutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,50 +26,56 @@ MPI_Comm get_scalapack_comm(tamm::ExecutionContext& ec, int sca_nranks) {
return scacomm;
}

void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_Comm& scacomm) {
void setup_scalapack_info(tamm::ExecutionContext& ec, ChemEnv& chem_env,
ScalapackInfo& scalapack_info, ProcGroupData& pgdata) {
SystemData& sys_data = chem_env.sys_data;
SCFOptions& scf_options = chem_env.ioptions.scf_options;
#if defined(USE_UPCXX)
abort(); // Not supported with UPC++
#endif
scalapack_info.comm = scacomm;
if(scacomm == MPI_COMM_NULL) return;
auto blacs_setup_st = std::chrono::high_resolution_clock::now();
// Sanity checks
scalapack_info.npr = scf_options.scalapack_np_row;
scalapack_info.npc = scf_options.scalapack_np_col;
int scalapack_nranks = scalapack_info.npr * scalapack_info.npc;

scalapack_info.pg = ProcGroup::create_coll(scacomm);
scalapack_info.ec =
ExecutionContext{scalapack_info.pg, DistributionKind::dense, MemoryManagerKind::ga};

int sca_world_size = scalapack_info.pg.size().value();

// Default to square(ish) grid
if(scalapack_nranks == 0) {
int64_t npr = std::sqrt(sca_world_size);
int64_t npc = sca_world_size / npr;
while(npr * npc != sca_world_size) {
npr--;
npc = sca_world_size / npr;
}
scalapack_nranks = sca_world_size;
scalapack_info.npr = npr;
scalapack_info.npc = npc;
}
scalapack_info.npr = scf_options.scalapack_np_row;
scalapack_info.npc = scf_options.scalapack_np_col;
int sca_user_ranks = scalapack_info.npr * scalapack_info.npc;

// node_p_sca = % of nodes, nbf_p_sca = 4 = % of nbf for scalapack
int sca_nranks = std::ceil(sys_data.nbf_orig * (4 / 100.0));
// if(node_p_sca > 0) sca_nranks = std::ceil((node_p_sca / 100.0) * nnodes);
if(sca_user_ranks > 0) sca_nranks = sca_user_ranks;
if(sca_nranks > pgdata.spg_nranks) sca_nranks = pgdata.spg_nranks;
// Find nearest square
sca_nranks = std::pow(std::floor(std::sqrt(sca_nranks)), 2);
if(sca_nranks == 0) sca_nranks = 1;

int sca_nnodes = sca_nranks / pgdata.ppn;
if(sca_nranks % pgdata.ppn > 0 || sca_nnodes == 0) sca_nnodes++;
// if(sca_nnodes > pgdata.spg_nnodes) sca_nnodes = pgdata.spg_nnodes;

pgdata.scalapack_nnodes = sca_nnodes;
pgdata.scalapack_nranks = sca_nranks;
scalapack_info.scalapack_nranks = sca_nranks;

// Always use square grid
scalapack_info.npr = std::sqrt(sca_nranks);
scalapack_info.npc = scalapack_info.npr;

EXPECTS(sca_world_size >= scalapack_nranks);
scalapack_info.comm = get_scalapack_comm(ec, sca_nranks);

if(not scalapack_nranks) scalapack_nranks = sca_world_size;
std::vector<int64_t> scalapack_ranks(scalapack_nranks);
if(scalapack_info.comm == MPI_COMM_NULL) return;
// auto blacs_setup_st = std::chrono::high_resolution_clock::now();

scalapack_info.pg = ProcGroup::create_coll(scalapack_info.comm);
scalapack_info.ec =
ExecutionContext{scalapack_info.pg, DistributionKind::dense, MemoryManagerKind::ga};

std::vector<int64_t> scalapack_ranks(sca_nranks);
std::iota(scalapack_ranks.begin(), scalapack_ranks.end(), 0);
scalapack_info.scalapack_nranks = scalapack_nranks;

int& mb_ = scf_options.scalapack_nb;

if(scalapack_info.pg.rank() == 0) {
std::cout << "scalapack_nranks = " << scalapack_nranks << std::endl;
std::cout << "scalapack_nnodes = " << sca_nnodes << std::endl;
std::cout << "scalapack_nranks = " << sca_nranks << std::endl;
std::cout << "scalapack_np_row = " << scalapack_info.npr << std::endl;
std::cout << "scalapack_np_col = " << scalapack_info.npc << std::endl;
std::cout << "scalapack_nb = " << mb_ << std::endl;
Expand All @@ -90,9 +96,9 @@ void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_
scalapack_info.blockcyclic_dist =
std::make_unique<scalapackpp::BlockCyclicDist2D>(*scalapack_info.blacs_grid, mb_, mb_, 0, 0);

auto blacs_setup_en = std::chrono::high_resolution_clock::now();
// auto blacs_setup_en = std::chrono::high_resolution_clock::now();

std::chrono::duration<double> blacs_time = blacs_setup_en - blacs_setup_st;
// std::chrono::duration<double> blacs_time = blacs_setup_en - blacs_setup_st;

// if(scalapack_info.pg.rank() == 0)
// std::cout << std::fixed << std::setprecision(2) << std::endl
Expand All @@ -103,7 +109,7 @@ void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_

// Nbf, % of nodes, % of Nbf, nnodes from input file
ProcGroupData get_spg_data(ExecutionContext& ec, const size_t N, const int node_p, const int nbf_p,
const int node_inp, const int node_p_sca, const int nbf_p_sca) {
const int node_inp) {
ProcGroupData pgdata;
pgdata.nnodes = ec.nnodes();
pgdata.ppn = ec.ppn();
Expand All @@ -126,23 +132,5 @@ ProcGroupData get_spg_data(ExecutionContext& ec, const size_t N, const int node_
pgdata.spg_nnodes = spg_nnodes;
pgdata.spg_nranks = spg_nranks;

#if defined(USE_SCALAPACK)
// Find nearest square
int sca_nranks = std::ceil(N * (nbf_p_sca / 100.0));
if(node_p_sca > 0) sca_nranks = std::ceil((node_p_sca / 100.0) * nnodes);
if(sca_nranks > spg_nranks) sca_nranks = spg_nranks;
sca_nranks = std::pow(std::floor(std::sqrt(sca_nranks)), 2);
if(sca_nranks == 0) sca_nranks = 1;
int sca_nnodes = sca_nranks / ppn;
if(sca_nranks % ppn > 0 || sca_nnodes == 0) sca_nnodes++;
if(sca_nnodes > nnodes) sca_nnodes = nnodes;
// if(sca_nnodes == 1) ppn = sca_nranks; // single node case
pgdata.scalapack_nnodes = sca_nnodes;
pgdata.scalapack_nranks = sca_nranks;
#else
pgdata.scalapack_nnodes = spg_nnodes;
pgdata.scalapack_nranks = spg_nranks;
#endif

return pgdata;
}
36 changes: 17 additions & 19 deletions exachem/common/cutils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,19 @@ struct ScalapackInfo {
};
#endif

// Contains node, ppn information used for creating a smaller process group from world group.
// All fields are value-initialized to 0.
struct ProcGroupData {
  int nnodes{};     // total number of nodes
  int spg_nnodes{}; // number of nodes in smaller process group
  int ppn{};        // processes per node
  int spg_nranks{}; // number of ranks in smaller process group
  // #nodes used for scalapack operations can further be a subset of the smaller process group
  int scalapack_nnodes{};
  int scalapack_nranks{};

  // Returns (nnodes, spg_nnodes, ppn, spg_nranks) by value for use with structured bindings.
  // The scalapack fields are not part of the tuple. Const-qualified so it can be called
  // through a const object or reference.
  auto unpack() const { return std::make_tuple(nnodes, spg_nnodes, ppn, spg_nranks); }
};

#if defined(USE_SCALAPACK)
struct ScalapackInfo {
int64_t npr{}, npc{}, scalapack_nranks{};
Expand All @@ -66,25 +79,10 @@ struct ScalapackInfo {

MPI_Comm get_scalapack_comm(tamm::ExecutionContext& ec, int sca_nranks);

void setup_scalapack_info(ChemEnv& chem_env, ScalapackInfo& scalapack_info, MPI_Comm& scacomm);
void setup_scalapack_info(tamm::ExecutionContext& ec, ChemEnv& chem_env,
ScalapackInfo& scalapack_info, ProcGroupData& pgdata);
#endif

// Contains node, ppn information used for creating a smaller process group from world group
struct ProcGroupData {
int nnodes{}; // total number of nodes
int spg_nnodes{}; // number of nodes in smaller process group
int ppn{}; // processes per node
int spg_nranks{}; // number of rank in smaller process group
// #nodes used for scalapack operations can further be a subset of the smaller process group
int scalapack_nnodes{};
int scalapack_nranks{};

auto unpack() {
return std::make_tuple(nnodes, spg_nnodes, ppn, spg_nranks, scalapack_nnodes, scalapack_nranks);
}
};

// Nbf, % of nodes, % of Nbf, nnodes from input file, (% of nodes, % of nbf) for scalapack
// Nbf, % of nodes, % of Nbf, nnodes from input file
ProcGroupData get_spg_data(ExecutionContext& ec, const size_t N, const int node_p,
const int nbf_p = -1, const int node_inp = -1, const int node_p_sca = -1,
const int nbf_p_sca = -1);
const int nbf_p = -1, const int node_inp = -1);
2 changes: 1 addition & 1 deletion exachem/common/options/input_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class SCFOptions: public CommonOptions {
int multiplicity{1};
double lshift{0}; // level shift factor, +ve value b/w 0 and 1
double tol_int{1e-22}; // tolerance for integral primitive screening
double tol_sch{1e-10}; // tolerance for schwarz screening
double tol_sch{1e-12}; // tolerance for schwarz screening
double tol_lindep{1e-5}; // tolerance for linear dependencies
double conve{1e-8}; // energy convergence
double convd{1e-7}; // density convergence
Expand Down
20 changes: 7 additions & 13 deletions exachem/scf/scf_hartree_fock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e
// -------------Everything related to Basis Sets-----------------------------

#if SCF_THROTTLE_RESOURCES
ProcGroupData pgdata = get_spg_data(exc, N, -1, 50, chem_env.ioptions.scf_options.nnodes, -1, 4);
auto [t_nnodes, hf_nnodes, ppn, hf_nranks, sca_nnodes, sca_nranks] = pgdata.unpack();
ProcGroupData pgdata = get_spg_data(exc, N, -1, 50, chem_env.ioptions.scf_options.nnodes);
auto [t_nnodes, hf_nnodes, ppn, hf_nranks] = pgdata.unpack();

#if defined(USE_UPCXX)
bool in_new_team = (rank < hf_nranks);
Expand All @@ -76,20 +76,13 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e
MPI_Group_free(&hfgroup);
#endif

#if defined(USE_SCALAPACK)
MPI_Comm scacomm = get_scalapack_comm(exc, sca_nranks);
#endif
#endif

if(rank == 0) {
std::cout << std::endl;
#if SCF_THROTTLE_RESOURCES
std::cout << "Number of nodes, processes per node used for SCF calculation: " << hf_nnodes
<< ", " << ppn << std::endl;
#endif
#if defined(USE_SCALAPACK)
cout << "Number of nodes, processes per node, total processes used for Scalapack operations: "
<< sca_nnodes << ", " << sca_nranks / sca_nnodes << ", " << sca_nranks << endl;
#endif
chem_env.ioptions.common_options.print();
chem_env.ioptions.scf_options.print();
Expand Down Expand Up @@ -268,7 +261,8 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e

ScalapackInfo scalapack_info;
#if defined(USE_SCALAPACK)
setup_scalapack_info(chem_env, scalapack_info, scacomm);
setup_scalapack_info(ec, chem_env, scalapack_info, pgdata);
MPI_Comm scacomm = scalapack_info.comm;
#endif

#if defined(USE_GAUXC)
Expand Down Expand Up @@ -585,12 +579,12 @@ void exachem::scf::SCFHartreeFock::scf_hf(ExecutionContext& exc, ChemEnv& chem_e
Matrix S(chem_env.sys_data.nbf_orig, chem_env.sys_data.nbf_orig);
tamm_to_eigen_tensor(ttensors.S1, S);
if(chem_env.sys_data.is_restricted)
cout << "debug #electrons = " << (int) std::ceil((etensors.D_alpha * S).trace())
cout << "debug #electrons = " << (int) std::round((etensors.D_alpha * S).trace())
<< endl;
if(chem_env.sys_data.is_unrestricted) {
cout << "debug #alpha electrons = " << (int) std::ceil((etensors.D_alpha * S).trace())
cout << "debug #alpha electrons = " << (int) std::round((etensors.D_alpha * S).trace())
<< endl;
cout << "debug #beta electrons = " << (int) std::ceil((etensors.D_beta * S).trace())
cout << "debug #beta electrons = " << (int) std::round((etensors.D_beta * S).trace())
<< endl;
}
}
Expand Down
4 changes: 1 addition & 3 deletions exachem/scf/scf_iter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ std::tuple<TensorType, TensorType> exachem::scf::SCFIter::scf_iter_body(

const bool is_uhf = sys_data.is_unrestricted;
const bool is_rhf = sys_data.is_restricted;
const bool do_snK = sys_data.do_snK;
const double lshift = scf_vars.lshift;

Tensor<TensorType>& H1 = ttensors.H1;
Expand Down Expand Up @@ -59,6 +58,7 @@ std::tuple<TensorType, TensorType> exachem::scf::SCFIter::scf_iter_body(
double ehf = 0.0;

#if defined(USE_GAUXC)
const bool do_snK = sys_data.do_snK;
if(do_snK) {
const auto snK_start = std::chrono::high_resolution_clock::now();
scf::gauxc::compute_exx<TensorType>(ec, chem_env, scf_vars, ttensors, etensors,
Expand Down Expand Up @@ -448,8 +448,6 @@ void exachem::scf::SCFIter::compute_2c_ints(ExecutionContext& ec, ChemEnv& chem_

SCFOptions& scf_options = chem_env.ioptions.scf_options;

auto rank = ec.pg().rank();

const libint2::BasisSet& dfbs = scf_vars.dfbs;
const std::vector<Tile>& dfAO_tiles = scf_vars.dfAO_tiles;
const std::vector<size_t>& df_shell_tile_map = scf_vars.df_shell_tile_map;
Expand Down
2 changes: 1 addition & 1 deletion exachem/scf/scf_outputs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void exachem::scf::SCFIO::print_energies(ExecutionContext& ec, ChemEnv& chem_env
}

if(ec.pg().rank() == 0) {
std::cout << "#electrons = " << (int) std::ceil(nelectrons) << endl;
std::cout << "#electrons = " << (int) std::round(nelectrons) << endl;
std::cout << "1e energy kinetic = " << std::setprecision(16) << kinetic_1e << endl;
std::cout << "1e energy N-e = " << NE_1e << endl;
std::cout << "1e energy = " << energy_1e << endl;
Expand Down
4 changes: 2 additions & 2 deletions inputs/scripts/nwchem_to_exachem.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def parse_nwchem_input(input_file):
nwchem_opt["SCF"]["PRINT"] = {}
scf_opt = nwchem_opt["SCF"]
scf_opt["tol_int"] = 1e-22
scf_opt["tol_sch"] = 1e-10
scf_opt["tol_sch"] = 1e-12
scf_opt["tol_lindep"] = 1e-5
scf_opt["conve"] = 1e-8
scf_opt["convd"] = 1e-7
Expand All @@ -129,7 +129,7 @@ def parse_nwchem_input(input_file):
dft_opt = nwchem_opt["DFT"]
dft_opt["hfexch"] = False
dft_opt["tol_int"] = 1e-22
dft_opt["tol_sch"] = 1e-10
dft_opt["tol_sch"] = 1e-12
dft_opt["tol_lindep"] = 1e-5
dft_opt["conve"] = 1e-8
dft_opt["convd"] = 1e-7
Expand Down
Loading

0 comments on commit 66d2dff

Please sign in to comment.