diff --git a/include/picongpu/simulation/control/Simulation.hpp b/include/picongpu/simulation/control/Simulation.hpp index 8b8e6666e4..10baa225fe 100644 --- a/include/picongpu/simulation/control/Simulation.hpp +++ b/include/picongpu/simulation/control/Simulation.hpp @@ -39,6 +39,7 @@ #include "picongpu/random/seed/ISeed.hpp" #include "picongpu/simulation/control/DomainAdjuster.hpp" #include "picongpu/simulation/control/MovingWindow.hpp" +#include "picongpu/simulation/control/checkpointingState.hpp" #include "picongpu/simulation/stage/AtomicPhysics.hpp" #include "picongpu/simulation/stage/Collision.hpp" #include "picongpu/simulation/stage/CurrentBackground.hpp" @@ -69,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +93,8 @@ namespace picongpu { using namespace pmacc; + using SimHelper = SimulationHelper>; + /** * Global simulation controller class. * @@ -99,7 +103,7 @@ namespace picongpu * * @tparam DIM the dimension (2-3) for the simulation */ - class Simulation : public SimulationHelper + class Simulation : public SimHelper { public: /** @@ -109,7 +113,7 @@ namespace picongpu void pluginRegisterHelp(po::options_description& desc) override { - SimulationHelper::pluginRegisterHelp(desc); + SimHelper::pluginRegisterHelp(desc); // clang-format off desc.add_options()( @@ -159,7 +163,7 @@ namespace picongpu void startSimulation() override { if(!skipSimulation) - SimulationHelper::startSimulation(); + SimHelper::startSimulation(); } nlohmann::json metadata() const @@ -283,7 +287,7 @@ namespace picongpu log("rank %1%; localsize %2%; localoffset %3%;") % myGPUpos.toString() % gridSizeLocal.toString() % gridOffset.toString(); - SimulationHelper::pluginLoad(); + SimHelper::pluginLoad(); GridLayout layout(gridSizeLocal, GuardSize::toRT() * SuperCellSize::toRT()); cellDescription = std::make_unique(layout.sizeND(), DataSpace(GuardSize::toRT())); @@ -304,7 +308,7 @@ namespace picongpu { DataConnector& dc = Environment<>::get().DataConnector(); - SimulationHelper::pluginUnload(); + SimHelper::pluginUnload(); /** unshare all registered ISimulationData sets * @@ -456,36 +460,10 @@ namespace picongpu if(initialiserController) { initialiserController->printInformation(); - if(this->restartRequested) - { - /* we do not require '--checkpoint.restart.step' if a master checkpoint file is found */ - if(this->restartStep < 0) - { - std::vector checkpoints = readCheckpointMasterFile(); - - if(checkpoints.empty()) - { - if(this->tryRestart == false) - { - throw std::runtime_error( - "Restart failed. You must provide the " - "'--checkpoint.restart.step' argument. See picongpu --help."); - } - else - { - // no checkpoint found: start simulation from scratch - this->restartRequested = false; - } - } - else - this->restartStep = checkpoints.back(); - } - } - - if(this->restartRequested) + if(this->checkpointing.checkRestart(step)) { - initialiserController->restart((uint32_t) this->restartStep, this->restartDirectory); - step = this->restartStep; + step = static_cast(this->checkpointing.getRestartStep()); + initialiserController->restart(step, this->checkpointing.getRestartDir()); } else { @@ -547,13 +525,13 @@ namespace picongpu void dumpOneStep(uint32_t currentStep) override { fieldBackground->toDumpState(currentStep); - SimulationHelper::dumpOneStep(currentStep); + SimHelper::dumpOneStep(currentStep); } void notifyPlugins(uint32_t currentStep) override { fieldBackground->toPluginState(currentStep); - SimulationHelper::notifyPlugins(currentStep); + SimHelper::notifyPlugins(currentStep); } void movingWindowCheck(uint32_t currentStep) override diff --git a/include/picongpu/simulation/control/checkpointingState.hpp b/include/picongpu/simulation/control/checkpointingState.hpp new file mode 100644 index 0000000000..4fd2fceeb6 --- /dev/null +++ b/include/picongpu/simulation/control/checkpointingState.hpp @@ -0,0 +1,30 @@ +/* Copyright 2025 Tapish Narwal + * + * This file is part of PIConGPU. + * + * PIConGPU is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * PIConGPU is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with PIConGPU. + * If not, see . + */ + +#pragma once + +namespace picongpu +{ + static constexpr bool checkpointingEnabled = +#if (ENABLE_OPENPMD == 1) + true; +#else + false; +#endif +} // namespace picongpu diff --git a/include/pmacc/simulationControl/Checkpointing.hpp b/include/pmacc/simulationControl/Checkpointing.hpp new file mode 100644 index 0000000000..aaac2a6e6c --- /dev/null +++ b/include/pmacc/simulationControl/Checkpointing.hpp @@ -0,0 +1,440 @@ +/* Copyright 2025 Tapish Narwal + * + * This file is part of PMacc. + * + * PMacc is free software: you can redistribute it and/or modify + * it under the terms of either the GNU General Public License or + * the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * PMacc is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License and the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * and the GNU Lesser General Public License along with PMacc. + * If not, see . + */ + + +#pragma once + +#include "pmacc/Environment.hpp" +#include "pmacc/filesystem.hpp" +#include "pmacc/mappings/simulation/Filesystem.hpp" +#include "pmacc/mappings/simulation/GridController.hpp" +#include "pmacc/pluginSystem/Slice.hpp" +#include "pmacc/pluginSystem/containsStep.hpp" +#include "pmacc/pluginSystem/toSlice.hpp" +#include "pmacc/simulationControl/signal.hpp" +#include "pmacc/types.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pmacc::simulationControl +{ + // State of restart. + // DISABLED means no restart is attempted. + // TRY means a restart is attempted, but the simulation will continue even if restart fails. + // FORCE means a restart is attempted, and the simulation will fail if restart fails. + // SUCCESS means a restart was successful. + // FAILED means a restart failed. + enum class RestartState : uint8_t + { + DISABLED, + TRY, + FORCE, + SUCCESS, + FAILED + }; + + /** + * @brief Stream insertion operator for RestartState, converting enum to string. + * @param out The output stream. + * @param state The RestartState value to write. + * @return The output stream. + */ + inline std::ostream& operator<<(std::ostream& out, RestartState const& state) + { + static std::unordered_map const stateToString + = {{RestartState::DISABLED, "disabled"}, + {RestartState::TRY, "try"}, + {RestartState::FORCE, "force"}, + {RestartState::SUCCESS, "success"}, + {RestartState::FAILED, "failed"}}; + + if(auto it = stateToString.find(state); it != stateToString.end()) + { + out << it->second; + } + else + { + // Handle unknown enum values gracefully. + out << "UNKNOWN_RESTART_STATE(" << static_cast(state) << ")"; + } + return out; + } + + /** + * @brief Stream extraction operator for RestartState, converting string to enum. + * @param in The input stream. + * @param state The RestartState variable to populate. + * @return The input stream. + */ + inline std::istream& operator>>(std::istream& in, RestartState& state) + { + static std::unordered_map const stringToState + = {{"disabled", RestartState::DISABLED}, + {"try", RestartState::TRY}, + {"force", RestartState::FORCE}, + {"success", RestartState::SUCCESS}, + {"failed", RestartState::FAILED}}; + + std::string token; + in >> token; + + if(auto it = stringToState.find(token); it != stringToState.end()) + { + state = it->second; + } + else + { + // If the token is not a valid state, set the stream's failbit. + // boost::program_options will catch this and report an error to the user. + in.setstate(std::ios_base::failbit); + } + return in; + } + + /** + * @brief Manages simulation checkpointing and restarting. + * + * This class provides functionalities to periodically save the simulation state + * (checkpointing) and to resume a simulation from a saved state (restarting). + * The checkpointing can be triggered based on simulation steps or wall-clock time. + * The entire functionality can be enabled or disabled at compile time via the + * `checkpointingEnabled` template parameter. + * + * @tparam checkpointingEnabled A boolean to enable/disable checkpointing features. + */ + template + struct Checkpointing + { + using SeqOfTimeSlices = std::vector; + + void registerHelp(po::options_description& desc) + { + if constexpr(checkpointingEnabled) + { + // clang-format off + desc.add_options() + ("checkpoint.restart.loop", po::value(&softRestarts)->default_value(0), + "Number of times to restart the simulation after simulation has finished (for presentations). " + "Note: does not yet work with all plugins, see issue #1305") + ("checkpoint.restart", po::value(&restartState)->zero_tokens()->implicit_value(RestartState::FORCE), + "Restart simulation from a checkpoint. Requires a valid checkpoint.") + ("checkpoint.tryRestart", po::value(&restartState)->zero_tokens()->implicit_value(RestartState::TRY), + "Try to restart if a checkpoint is available else start the simulation from scratch.") + ("checkpoint.restart.directory", po::value(&restartDirectory)->default_value(restartDirectory), + "Directory containing checkpoints for a restart") + ("checkpoint.restart.step", po::value(&restartStep), + "Checkpoint step to restart from") + ("checkpoint.period", po::value(&checkpointPeriod), + "Period for checkpoint creation [interval(s) based on steps]") + ("checkpoint.timePeriod", po::value(&checkpointPeriodMinutes), + "Time periodic checkpoint creation [period in minutes]") + ("checkpoint.directory", po::value(&checkpointDirectory)->default_value(checkpointDirectory), + "Directory for checkpoints"); + // clang-format on + // translate checkpointPeriod string into checkpoint intervals + seqCheckpointPeriod = pluginSystem::toTimeSlice(checkpointPeriod); + } + } + + void addCheckpoint(uint32_t signalMaxTimestep) + { + if constexpr(checkpointingEnabled) + { + seqCheckpointPeriod.push_back(pluginSystem::Slice(signalMaxTimestep, signalMaxTimestep)); + } + else + { + std::cout << "Checkpointing is disabled, no checkpoint will be created." << std::endl; + } + } + + template + void dump(uint32_t currentStep) + { + if constexpr(checkpointingEnabled) + { + /* trigger checkpoint notification */ + if(pluginSystem::containsStep(seqCheckpointPeriod, currentStep)) + { + /* first synchronize: if something failed, we can spare the time + * for the checkpoint writing */ + alpaka::wait(manager::Device::get().current()); + + // avoid deadlock between not finished PMacc tasks and MPI_Barrier + eventSystem::getTransactionEvent().waitForFinished(); + + GridController& gc = Environment::get().GridController(); + /* can be spared for better scalings, but allows to spare the + * time for checkpointing if some ranks died */ + MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); + + /* create directory containing checkpoints */ + if(numCheckpoints == 0 && gc.getGlobalRank() == 0) + { + pmacc::Filesystem::get().createDirectoryWithPermissions(checkpointDirectory); + } + + Environment::get().PluginConnector().checkpointPlugins(currentStep, checkpointDirectory); + + /* important synchronize: only if no errors occured until this + * point guarantees that a checkpoint is usable */ + alpaka::wait(manager::Device::get().current()); + + /* avoid deadlock between not finished PMacc tasks and MPI_Barrier */ + eventSystem::getTransactionEvent().waitForFinished(); + + /* \todo in an ideal world with MPI-3, this would be an + * MPI_Ibarrier call and this function would return a MPI_Request + * that could be checked */ + MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); + + if(gc.getGlobalRank() == 0) + { + writeCheckpointStep(currentStep); + } + numCheckpoints++; + } + } + } + + bool doSoftRestart() + { + static uint32_t nthSoftRestart = 0; + if(nthSoftRestart <= softRestarts) + { + nthSoftRestart++; + return true; + } + return false; + } + + void doTimeBasedCheckpointing() + { + if constexpr(checkpointingEnabled) + { + // register concurrent thread to perform checkpointing periodically after a user defined time + if(checkpointPeriodMinutes != 0) + checkpointTimeThread = std::thread( + [&, this]() + { + std::unique_lock lk(this->concurrentThreadMutex); + while(exitConcurrentThreads.wait_until( + lk, + std::chrono::system_clock::now() + std::chrono::minutes(checkpointPeriodMinutes)) + == std::cv_status::timeout) + { + signal::detail::setCreateCheckpoint(1); + } + }); + } + } + + void endTimeBasedCheckpointing() + { + if constexpr(checkpointingEnabled) + { + { + // notify all concurrent threads to exit + std::unique_lock lk(this->concurrentThreadMutex); + exitConcurrentThreads.notify_all(); + } + // wait for time triggered checkpoint thread + if(checkpointTimeThread.joinable()) + checkpointTimeThread.join(); + } + } + + /* Returns whether a restart needs to be performed */ + bool checkRestart(uint32_t& step) + { + if constexpr(checkpointingEnabled) + { + if(restartState != RestartState::TRY && restartState != RestartState::FORCE) + { + return false; + } + std::vector checkpoints = readCheckpointMasterFile(); + + // If no specific restart step is given, default to the latest available checkpoint. + if(restartStep < 0) + { + if(checkpoints.empty()) + { + if(restartState == RestartState::FORCE) + { + throw std::runtime_error( + "Restart failed. No checkpoints found and no '--checkpoint.restart.step' provided."); + } + restartState = RestartState::FAILED; + return false; + } + restartStep = checkpoints.back(); + } + + // Checkpoints are expected to be sorted chronologically. + bool const stepFound = std::binary_search(checkpoints.cbegin(), checkpoints.cend(), restartStep); + + if(!stepFound) + { + if(restartState == RestartState::FORCE) + { + throw std::runtime_error( + "Restart failed. Checkpoint for step " + std::to_string(restartStep) + " not found."); + } + restartState = RestartState::FAILED; + return false; + } + + // At this point, restart is possible. + restartState = RestartState::SUCCESS; + return true; + } + return false; + } + + [[nodiscard]] RestartState getRestartState() const + { + return restartState; + } + + [[nodiscard]] int32_t getRestartStep() const + { + return restartStep; + } + + [[nodiscard]] std::string const& getRestartDir() const + { + return restartDirectory; + } + + private: + /** Presentations: loop the whole simulation `softRestarts` times from + * initial step to runSteps */ + uint32_t softRestarts{0}; + + /* period for checkpoint creation [interval(s) based on steps]*/ + std::string checkpointPeriod; + + /* checkpoint intervals */ + SeqOfTimeSlices seqCheckpointPeriod; + + /* period for checkpoint creation [period in minutes] + * Zero is disabling time depended checkpointing. + */ + std::uint64_t checkpointPeriodMinutes = 0u; + std::thread checkpointTimeThread; + + // conditional variable to notify all concurrent threads and signal exit of the simulation + std::condition_variable exitConcurrentThreads; + std::mutex concurrentThreadMutex; + + /* common directory for checkpoints */ + std::string checkpointDirectory{"checkpoints"}; + + /* number of checkpoints written */ + uint32_t numCheckpoints{0}; + + /* checkpoint step to restart from */ + int32_t restartStep{-1}; + + /* common directory for restarts */ + std::string restartDirectory{"checkpoints"}; + + /* filename for checkpoint master file with all checkpoint timesteps */ + static constexpr std::string_view CHECKPOINT_MASTER_FILE{"checkpoints.txt"}; + + RestartState restartState{RestartState::DISABLED}; + + /** + * Append \p checkpointStep to the master checkpoint file + * + * @param checkpointStep current checkpoint step + */ + void writeCheckpointStep(uint32_t const checkpointStep) + { + stdfs::path const checkpointMasterFile = stdfs::path(checkpointDirectory) / CHECKPOINT_MASTER_FILE; + + std::ofstream file(checkpointMasterFile, std::ofstream::app); + + if(!file) + { + throw std::runtime_error("Failed to write checkpoint master file: " + checkpointMasterFile.string()); + } + + file << checkpointStep << '\n'; + } + + /** + * Reads the checkpoint master file if any and returns all found checkpoint steps + * + * @return vector of found checkpoints steps in order they appear in the file + */ + std::vector readCheckpointMasterFile() + { + std::vector checkpoints; + + stdfs::path const checkpointMasterFile = stdfs::path(restartDirectory) / CHECKPOINT_MASTER_FILE; + + if(!stdfs::exists(checkpointMasterFile)) + { + return checkpoints; + } + + std::ifstream file(checkpointMasterFile); + if(!file) + { + std::cerr << "Warning: Could not open checkpoint master file: " << checkpointMasterFile << std::endl; + return checkpoints; + } + + /* read each line */ + std::string line; + while(std::getline(file, line)) + { + if(line.empty()) + { + continue; + } + + uint32_t step; + auto const [ptr, ec] = std::from_chars(line.data(), line.data() + line.size(), step); + if(ec == std::errc{} && ptr == line.data() + line.size()) + { + checkpoints.push_back(step); + } + else + { + std::cerr << "Warning: checkpoint master file contains invalid data (" << line << ")" << std::endl; + } + } + + return checkpoints; + } + }; + +} // namespace pmacc::simulationControl diff --git a/include/pmacc/simulationControl/SimulationHelper.cpp b/include/pmacc/simulationControl/SimulationHelper.cpp index 425e382528..e693e7d50d 100644 --- a/include/pmacc/simulationControl/SimulationHelper.cpp +++ b/include/pmacc/simulationControl/SimulationHelper.cpp @@ -27,52 +27,33 @@ #include "pmacc/dataManagement/DataConnector.hpp" #include "pmacc/dimensions/DataSpace.hpp" #include "pmacc/eventSystem/Manager.hpp" -#include "pmacc/filesystem.hpp" -#include "pmacc/mappings/simulation/Filesystem.hpp" -#include "pmacc/mappings/simulation/GridController.hpp" #include "pmacc/particles/IdProvider.hpp" #include "pmacc/pluginSystem/IPlugin.hpp" #include "pmacc/pluginSystem/containsStep.hpp" #include "pmacc/pluginSystem/toSlice.hpp" +#include "pmacc/simulationControl/Checkpointing.hpp" #include "pmacc/simulationControl/signal.hpp" #include "pmacc/types.hpp" #include -#include -#include -#include #include #include #include #include -#include namespace pmacc { - template - SimulationHelper::SimulationHelper() - : checkpointDirectory("checkpoints") - , restartDirectory("checkpoints") - , CHECKPOINT_MASTER_FILE("checkpoints.txt") - , author("") - + template + SimulationHelper::SimulationHelper() : author("") { tSimulation.toggleStart(); tInit.toggleStart(); } - template - SimulationHelper::~SimulationHelper() + template + SimulationHelper::~SimulationHelper() { - { - // notify all concurrent threads to exit - std::unique_lock lk(this->concurrentThreadMutex); - exitConcurrentThreads.notify_all(); - } - // wait for time triggered checkpoint thread - if(checkpointTimeThread.joinable()) - checkpointTimeThread.join(); - + checkpointing.endTimeBasedCheckpointing(); tSimulation.toggleEnd(); if(output) { @@ -82,8 +63,8 @@ namespace pmacc } } - template - void SimulationHelper::notifyPlugins(uint32_t currentStep) + template + void SimulationHelper::notifyPlugins(uint32_t currentStep) { Environment::get().PluginConnector().notifyPlugins(currentStep); /* Handle signals after we executed the plugins but before checkpointing, this will result into lower @@ -92,54 +73,14 @@ namespace pmacc checkSignals(currentStep); } - template - void SimulationHelper::dumpOneStep(uint32_t currentStep) + template + void SimulationHelper::dumpOneStep(uint32_t currentStep) { - /* trigger checkpoint notification */ - if(pluginSystem::containsStep(seqCheckpointPeriod, currentStep)) - { - /* first synchronize: if something failed, we can spare the time - * for the checkpoint writing */ - alpaka::wait(manager::Device::get().current()); - - // avoid deadlock between not finished PMacc tasks and MPI_Barrier - eventSystem::getTransactionEvent().waitForFinished(); - - GridController& gc = Environment::get().GridController(); - /* can be spared for better scalings, but allows to spare the - * time for checkpointing if some ranks died */ - MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); - - /* create directory containing checkpoints */ - if(numCheckpoints == 0 && gc.getGlobalRank() == 0) - { - pmacc::Filesystem::get().createDirectoryWithPermissions(checkpointDirectory); - } - - Environment::get().PluginConnector().checkpointPlugins(currentStep, checkpointDirectory); - - /* important synchronize: only if no errors occured until this - * point guarantees that a checkpoint is usable */ - alpaka::wait(manager::Device::get().current()); - - /* avoid deadlock between not finished PMacc tasks and MPI_Barrier */ - eventSystem::getTransactionEvent().waitForFinished(); - - /* \todo in an ideal world with MPI-3, this would be an - * MPI_Ibarrier call and this function would return a MPI_Request - * that could be checked */ - MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm())); - - if(gc.getGlobalRank() == 0) - { - writeCheckpointStep(currentStep); - } - numCheckpoints++; - } + checkpointing.template dump(currentStep); } - template - void SimulationHelper::dumpTimes( + template + void SimulationHelper::dumpTimes( TimeIntervall& tSimCalculation, TimeIntervall&, double& roundAvg, @@ -173,8 +114,8 @@ namespace pmacc } } - template - void SimulationHelper::startSimulation() + template + void SimulationHelper::startSimulation() { if(useMpiDirect) Environment<>::get().enableMpiDirect(); @@ -182,20 +123,7 @@ namespace pmacc // Install a signal handler signal::activate(); - // register concurrent thread to perform checkpointing periodically after a user defined time - if(checkpointPeriodMinutes != 0) - checkpointTimeThread = std::thread( - [&, this]() - { - std::unique_lock lk(this->concurrentThreadMutex); - while(exitConcurrentThreads.wait_until( - lk, - std::chrono::system_clock::now() + std::chrono::minutes(checkpointPeriodMinutes)) - == std::cv_status::timeout) - { - signal::detail::setCreateCheckpoint(1); - } - }); + checkpointing.doTimeBasedCheckpointing(); uint64_t maxRanks = Environment::get().GridController().getGpuNodes().productOfComponents(); uint64_t rank = Environment::get().GridController().getScalarPosition(); @@ -206,10 +134,7 @@ namespace pmacc init(); - // translate checkpointPeriod string into checkpoint intervals - seqCheckpointPeriod = pluginSystem::toTimeSlice(checkpointPeriod); - - for(uint32_t nthSoftRestart = 0; nthSoftRestart <= softRestarts; ++nthSoftRestart) + while(checkpointing.doSoftRestart()) { /* Global offset is updated during the simulation. In case we perform a soft restart we need to reset * the offset here to be valid for the next simulation run. @@ -239,7 +164,7 @@ namespace pmacc movingWindowCheck(currentStep); /* call plugins and dump initial step if simulation starts without restart */ - if(!restartRequested) + if(checkpointing.getRestartState() != simulationControl::RestartState::SUCCESS) { notifyPlugins(currentStep); dumpOneStep(currentStep); @@ -292,42 +217,26 @@ namespace pmacc } // softRestarts loop } - template - void SimulationHelper::pluginRegisterHelp(po::options_description& desc) + template + void SimulationHelper::pluginRegisterHelp(po::options_description& desc) { // clang-format off desc.add_options() ("steps,s", po::value(&runSteps), "Simulation steps") - ("checkpoint.restart.loop", po::value(&softRestarts)->default_value(0), - "Number of times to restart the simulation after simulation has finished (for presentations). " - "Note: does not yet work with all plugins, see issue #1305") ("percent,p", po::value(&progress)->default_value(5), "Print time statistics after p percent to stdout") ("progressPeriod",po::value(&progressPeriod), "write progress [for each n-th step], plugin period syntax can be used here.") - ("checkpoint.restart", po::value(&restartRequested)->zero_tokens(), - "Restart simulation from a checkpoint. Requires a valid checkpoint.") - ("checkpoint.tryRestart", po::value(&tryRestart)->zero_tokens(), - "Try to restart if a checkpoint is available else start the simulation from scratch.") - ("checkpoint.restart.directory", po::value(&restartDirectory)->default_value(restartDirectory), - "Directory containing checkpoints for a restart") - ("checkpoint.restart.step", po::value(&restartStep), - "Checkpoint step to restart from") - ("checkpoint.period", po::value(&checkpointPeriod), - "Period for checkpoint creation [interval(s) based on steps]") - ("checkpoint.timePeriod", po::value(&checkpointPeriodMinutes), - "Time periodic checkpoint creation [period in minutes]") - ("checkpoint.directory", po::value(&checkpointDirectory)->default_value(checkpointDirectory), - "Directory for checkpoints") ("author", po::value(&author)->default_value(std::string("")), "The author that runs the simulation and is responsible for created output files") ("mpiDirect", po::value(&useMpiDirect)->zero_tokens(), "use device direct for MPI communication e.g. GPU direct"); // clang-format on + checkpointing.registerHelp(desc); } - template - void SimulationHelper::pluginLoad() + template + void SimulationHelper::pluginLoad() { Environment<>::get().SimulationDescription().setRunSteps(runSteps); Environment<>::get().SimulationDescription().setAuthor(author); @@ -338,13 +247,10 @@ namespace pmacc seqProgressPeriod = pluginSystem::toTimeSlice(progressPeriod); output = (getGridController().getGlobalRank() == 0); - - if(tryRestart) - restartRequested = true; } - template - void SimulationHelper::checkSignals(uint32_t const currentStep) + template + void SimulationHelper::checkSignals(uint32_t const currentStep) { /* Avoid signal handling if the last signal is still processed. * Signal handling in the first step is always allowed. @@ -420,7 +326,7 @@ namespace pmacc signalCreateCheckpoint = false; // add a new checkpoint - seqCheckpointPeriod.push_back(pluginSystem::Slice(signalMaxTimestep, signalMaxTimestep)); + checkpointing.addCheckpoint(signalMaxTimestep); } if(signalStopSimulation) { @@ -432,8 +338,8 @@ namespace pmacc } } - template - void SimulationHelper::calcProgress() + template + void SimulationHelper::calcProgress() { if(progress == 0 || progress > 100) progress = 100; @@ -444,55 +350,11 @@ namespace pmacc showProgressAnyStep = 1; } - template - void SimulationHelper::writeCheckpointStep(uint32_t const checkpointStep) - { - std::ofstream file; - std::string const checkpointMasterFile = checkpointDirectory + std::string("/") + CHECKPOINT_MASTER_FILE; - - file.open(checkpointMasterFile.c_str(), std::ofstream::app); - - if(!file) - throw std::runtime_error("Failed to write checkpoint master file"); - - file << checkpointStep << std::endl; - file.close(); - } - - template - std::vector SimulationHelper::readCheckpointMasterFile() - { - std::vector checkpoints; - - std::string const checkpointMasterFile - = this->restartDirectory + std::string("/") + this->CHECKPOINT_MASTER_FILE; - - if(!stdfs::exists(checkpointMasterFile)) - return checkpoints; - - std::ifstream file(checkpointMasterFile.c_str()); - - /* read each line */ - std::string line; - while(std::getline(file, line)) - { - if(line.empty()) - continue; - try - { - checkpoints.push_back(boost::lexical_cast(line)); - } - catch(boost::bad_lexical_cast const&) - { - std::cerr << "Warning: checkpoint master file contains invalid data (" << line << ")" << std::endl; - } - } - - return checkpoints; - } // Explicit template instantiation to provide symbols for usage together with PMacc - template class SimulationHelper; - template class SimulationHelper; + template class SimulationHelper>; + template class SimulationHelper>; + template class SimulationHelper>; + template class SimulationHelper>; } // namespace pmacc diff --git a/include/pmacc/simulationControl/SimulationHelper.hpp b/include/pmacc/simulationControl/SimulationHelper.hpp index 1008c4dccb..136311ad52 100644 --- a/include/pmacc/simulationControl/SimulationHelper.hpp +++ b/include/pmacc/simulationControl/SimulationHelper.hpp @@ -27,6 +27,7 @@ #include "pmacc/Environment.hpp" #include "pmacc/mappings/simulation/GridController.hpp" #include "pmacc/pluginSystem/IPlugin.hpp" +#include "pmacc/pluginSystem/Slice.hpp" #include "pmacc/types.hpp" #include @@ -42,7 +43,7 @@ namespace pmacc * * @tparam DIM base dimension for the simulation (2-3) */ - template + template class SimulationHelper : public IPlugin { public: @@ -148,51 +149,13 @@ namespace pmacc /* number of simulation steps to compute */ uint32_t runSteps{0}; - /** Presentations: loop the whole simulation `softRestarts` times from - * initial step to runSteps */ - uint32_t softRestarts; - - /* period for checkpoint creation [interval(s) based on steps]*/ - std::string checkpointPeriod; - - /* checkpoint intervals */ - SeqOfTimeSlices seqCheckpointPeriod; - - /* period for checkpoint creation [period in minutes] - * Zero is disabling time depended checkpointing. - */ - std::uint64_t checkpointPeriodMinutes = 0u; - std::thread checkpointTimeThread; - - // conditional variable to notify all concurrent threads and signal exit of the simulation - std::condition_variable exitConcurrentThreads; - std::mutex concurrentThreadMutex; - - /* common directory for checkpoints */ - std::string checkpointDirectory; - - /* number of checkpoints written */ - uint32_t numCheckpoints{0}; - - /* checkpoint step to restart from */ - int32_t restartStep{-1}; - - /* common directory for restarts */ - std::string restartDirectory; - - /* restart requested */ - bool restartRequested{false}; - - /* filename for checkpoint master file with all checkpoint timesteps */ - std::string const CHECKPOINT_MASTER_FILE; - + CheckpointingClass checkpointing; /* author that runs the simulation */ std::string author; //! enable MPI gpu direct bool useMpiDirect{false}; - bool tryRestart = false; private: /** Largest time step within the simulation (all MPI ranks) */ @@ -213,23 +176,6 @@ namespace pmacc */ void calcProgress(); - - /** - * Append \p checkpointStep to the master checkpoint file - * - * @param checkpointStep current checkpoint step - */ - void writeCheckpointStep(uint32_t const checkpointStep); - - protected: - /** - * Reads the checkpoint master file if any and returns all found checkpoint steps - * - * @return vector of found checkpoints steps in order they appear in the file - */ - std::vector readCheckpointMasterFile(); - - private: bool output = false; uint16_t progress;