Skip to content

Performance opt eurohack #318

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions alpine/AlpineManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ class AlpineManager
ippl::NDIndex<Dim> domain_m;
std::array<bool, Dim> decomp_m;
double rhoNorm_m;
Kokkos::View<int*> index_m;
Kokkos::View<int*> start_m;
Kokkos::View<int*> permuteTemp_m;

public:
size_type getTotalP() const { return totalP_m; }
Expand Down Expand Up @@ -129,6 +132,7 @@ class AlpineManager
Vector_t<double, Dim> hr = hr_m;

scatter(*q, *rho, *R);

double relError = std::fabs((Q-(*rho).sum())/Q);

m << relError << endl;
Expand Down
240 changes: 240 additions & 0 deletions alpine/ExamplesWithoutPicManager/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
// Uniform Plasma Test
// Usage:
// srun ./UniformPlasmaTest
// <nx> [<ny>...] <Np> <Nt> <stype>
// <lbthres> --overallocate <ovfactor> --info 10
// nx = No. cell-centered points in the x-direction
// ny... = No. cell-centered points in the y-, z-, ...direction
// Np = Total no. of macro-particles in the simulation
// Nt = Number of time steps
// stype = Field solver type (FFT, CG, P3M, and OPEN supported)
// lbfreq = Load balancing frequency i.e., Number of time steps after which particle
// load balancing should happen
// ovfactor = Over-allocation factor for the buffers used in the communication. Typical
// values are 1.0, 2.0. Value 1.0 means no over-allocation.
// Example:
// srun ./UniformPlasmaTest 128 128 128 10000 10 FFT 10 --overallocate 1.0 --info 10
//
#include <Kokkos_Random.hpp>
#include <chrono>
#include <iostream>
#include <random>
#include <set>
#include <string>
#include <vector>

#include "Utility/IpplTimings.h"

#include "ChargedParticles.hpp"

constexpr unsigned Dim = 3;

const char* TestName = "UniformPlasmaTest";

int main(int argc, char* argv[]) {
ippl::initialize(argc, argv);
{
setSignalHandler();

Inform msg("UniformPlasmaTest");
Inform msg2all(argv[0], INFORM_ALL_NODES);

auto start = std::chrono::high_resolution_clock::now();
int arg = 1;

Vector_t<int, Dim> nr;
for (unsigned d = 0; d < Dim; d++)
nr[d] = std::atoi(argv[arg++]);

static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("total");
static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation");
static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData");
static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("pushVelocity");
static IpplTimings::TimerRef temp = IpplTimings::getTimer("randomMove");
static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("pushPosition");
static IpplTimings::TimerRef updateTimer = IpplTimings::getTimer("update");
static IpplTimings::TimerRef DummySolveTimer = IpplTimings::getTimer("solveWarmup");
static IpplTimings::TimerRef SolveTimer = IpplTimings::getTimer("solve");
static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance");

IpplTimings::startTimer(mainTimer);

const size_type totalP = std::atoll(argv[arg++]);
const unsigned int nt = std::atoi(argv[arg++]);

msg << "Uniform Plasma Test" << endl
<< "nt " << nt << " Np= " << totalP << " grid = " << nr << endl;

using bunch_type = ChargedParticles<PLayout_t<double, Dim>, double, Dim>;

std::unique_ptr<bunch_type> P;

ippl::NDIndex<Dim> domain;
for (unsigned i = 0; i < Dim; i++) {
domain[i] = ippl::Index(nr[i]);
}

// create mesh and layout objects for this problem domain
Vector_t<double, Dim> rmin(0.0);
Vector_t<double, Dim> rmax(20.0);

Vector_t<double, Dim> hr = rmax / nr;
Vector_t<double, Dim> origin = rmin;
const double dt = 1.0;

std::array<bool, Dim> isParallel;
isParallel.fill(true);

const bool isAllPeriodic = true;
Mesh_t<Dim> mesh(domain, hr, origin);
FieldLayout_t<Dim> FL(MPI_COMM_WORLD, domain, isParallel, isAllPeriodic);
PLayout_t<double, Dim> PL(FL, mesh);

double Q = -1562.5;
std::string solver = argv[arg++];
P = std::make_unique<bunch_type>(PL, hr, rmin, rmax, isParallel, Q, solver);

P->nr_m = nr;
size_type nloc = totalP / ippl::Comm->size();

int rest = (int)(totalP - nloc * ippl::Comm->size());

if (ippl::Comm->rank() < rest)
++nloc;

IpplTimings::startTimer(particleCreation);
P->create(nloc);

const ippl::NDIndex<Dim>& lDom = FL.getLocalNDIndex();
Vector_t<double, Dim> Rmin, Rmax;
for (unsigned d = 0; d < Dim; ++d) {
Rmin[d] = origin[d] + lDom[d].first() * hr[d];
Rmax[d] = origin[d] + (lDom[d].last() + 1) * hr[d];
}

Kokkos::View<ippl::Vector<double, Dim>*> particle_pos;

Kokkos::parallel_for(nloc,
KOKKOS_LAMBDA(size_t i) {
Rview(i) = i * hr[0];
});
Kokkos::fence();
P->q = P->Q_m / totalP;
P->P = 0.0;
IpplTimings::stopTimer(particleCreation);

P->initializeFields(mesh, FL);

IpplTimings::startTimer(updateTimer);
P->update();
IpplTimings::stopTimer(updateTimer);

msg << "particles created and initial conditions assigned " << endl;

P->initSolver();
P->time_m = 0.0;
P->loadbalancefreq_m = std::atoi(argv[arg++]);

IpplTimings::startTimer(DummySolveTimer);
P->rho_m = 0.0;
P->runSolver();
IpplTimings::stopTimer(DummySolveTimer);

P->scatterCIC(totalP, 0, hr);
P->initializeORB(FL, mesh);
bool fromAnalyticDensity = false;

IpplTimings::startTimer(SolveTimer);
P->runSolver();
IpplTimings::stopTimer(SolveTimer);

P->gatherCIC();

IpplTimings::startTimer(dumpDataTimer);
P->dumpData();
P->gatherStatistics(totalP);
IpplTimings::stopTimer(dumpDataTimer);

// begin main timestep loop
msg << "Starting iterations ..." << endl;
// P->gatherStatistics(totalP);
for (unsigned int it = 0; it < nt; it++) {
// LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration
// Here, we assume a constant charge-to-mass ratio of -1 for
// all the particles hence eliminating the need to store mass as
// an attribute
// kick
IpplTimings::startTimer(PTimer);
P->P = P->P - 0.5 * dt * P->E;
IpplTimings::stopTimer(PTimer);

IpplTimings::startTimer(temp);
Kokkos::parallel_for(
P->getLocalNum(),
generate_random<Vector_t<double, Dim>, Kokkos::Random_XorShift64_Pool<>, Dim>(
P->P.getView(), rand_pool64, -hr, hr));
Kokkos::fence();
IpplTimings::stopTimer(temp);

// drift
IpplTimings::startTimer(RTimer);
P->R = P->R + dt * P->P;
IpplTimings::stopTimer(RTimer);

// Since the particles have moved spatially update them to correct processors
IpplTimings::startTimer(updateTimer);
P->update();
IpplTimings::stopTimer(updateTimer);

// Domain Decomposition
if (P->balance(totalP, it + 1)) {
msg << "Starting repartition" << endl;
IpplTimings::startTimer(domainDecomposition);
P->repartition(FL, mesh, fromAnalyticDensity);
IpplTimings::stopTimer(domainDecomposition);
}

// scatter the charge onto the underlying grid
P->scatterCIC(totalP, it + 1, hr);

// Field solve
IpplTimings::startTimer(SolveTimer);
P->runSolver();
IpplTimings::stopTimer(SolveTimer);

// gather E field
P->gatherCIC();

// kick
IpplTimings::startTimer(PTimer);
P->P = P->P - 0.5 * dt * P->E;
IpplTimings::stopTimer(PTimer);

P->time_m += dt;
IpplTimings::startTimer(dumpDataTimer);
P->dumpData();
P->gatherStatistics(totalP);
IpplTimings::stopTimer(dumpDataTimer);
msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl;

if (checkSignalHandler()) {
msg << "Aborting timestepping loop due to signal " << interruptSignalReceived
<< endl;
break;
}
}

msg << "Uniform Plasma Test: End." << endl;
IpplTimings::stopTimer(mainTimer);
IpplTimings::print();
IpplTimings::print(std::string("timing.dat"));
auto end = std::chrono::high_resolution_clock::now();

std::chrono::duration<double> time_chrono =
std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
std::cout << "Elapsed time: " << time_chrono.count() << std::endl;
}
ippl::finalize();

return 0;
}
Loading