diff --git a/.codespellignore b/.codespellignore index 2df21c0..89fc3e0 100644 --- a/.codespellignore +++ b/.codespellignore @@ -1,2 +1,3 @@ cachable -parma \ No newline at end of file +parma +mater \ No newline at end of file diff --git a/.codespellrc b/.codespellrc index bbc2c8b..c7e03fd 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./docs/_build,./docs/_static,./docs/31_sparta/*.sh -ignore-words-list = fom +skip = ./docs/_build,./docs/_static,./docs/31_sparta/*.sh,./docs/32_lammpsACE/*.sh +ignore-words-list = fom,Mater diff --git a/.gitignore b/.gitignore index a8069db..eb9e77c 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,17 @@ output.log lisp +# LAMMPS +docs/32_lammpsACE/lammps +docs/32_lammpsACE/kokkos-tools +docs/32_lammpsACE/lammps_build.log +docs/32_lammpsACE/kokkos_tools_build.log +docs/32_lammpsACE/templatedir/kokkos_tools_env_elcapitan.sh +docs/32_lammpsACE/templatedir/Cu-PBE-core-rep.ace +docs/32_lammpsACE/templatedir/lammps_env_elcapitan.sh +docs/32_lammpsACE/templatedir/lammps +docs/32_lammpsACE/check* + \#*# .[#]*[#] diff --git a/docs/31_sparta/sparta.rst b/docs/31_sparta/sparta.rst index 8dfb5a9..30e2690 100644 --- a/docs/31_sparta/sparta.rst +++ b/docs/31_sparta/sparta.rst @@ -426,12 +426,12 @@ A script (``sparta_clone.sh``) is provided to clone the SPARTA repository within the "sparta" folder. Instructions are provided on how to build SPARTA for the following systems: -* Generic (see :ref:`BuildGeneric`) +* Generic (see :ref:`BuildSpartaGeneric`) * Advanced Technology System 4 (ATS-4), also known as El Capitan (see - :ref:`BuildATS4`) + :ref:`BuildSpartaATS4`) -.. _BuildGeneric: +.. _BuildSpartaGeneric: Generic ------- @@ -440,7 +440,7 @@ Refer to SPARTA's [sparta-build]_ documentation for generic instructions. -.. _BuildATS4: +.. 
_BuildSpartaATS4: El Capitan ---------- @@ -547,12 +547,12 @@ Additional information: Single-node results from SPARTA are provided on the following systems: * Advanced Technology System 4 (ATS-4), also known as El Capitan (see - :ref:`ResultsATS4`) + :ref:`ResultsSpartaATS4`) Multi-node results from SPARTA are provided on the following system(s): * Advanced Technology System 4 (ATS-4), also known as El Capitan (see - :ref:`ResultsScaleATS4`) + :ref:`ResultsSpartaScaleATS4`) .. _SPARTAComputeFOM: @@ -582,7 +582,7 @@ example were unnecessary. INFO - 2026-02-16 20:54:44,673 - File = /path/to/llnl-benchmarks/docs/31_sparta/checks-10--nodes-001--L-2.0--ktst/log.sparta -.. _ResultsATS4: +.. _ResultsSpartaATS4: El Capitan - Single Node ------------------------ @@ -610,7 +610,7 @@ as part of the output. INFO - 2026-02-16 20:54:44,673 - File = /path/to/llnl-benchmarks/docs/31_sparta/checks-10--nodes-001--L-2.0--ktst/log.sparta -.. _ResultsScaleATS4: +.. _ResultsSpartaScaleATS4: El Capitan - Many Nodes ----------------------- diff --git a/docs/32_lammpsACE/kokkos_tools_build_elcapitan.sh b/docs/32_lammpsACE/kokkos_tools_build_elcapitan.sh new file mode 100755 index 0000000..cad3f00 --- /dev/null +++ b/docs/32_lammpsACE/kokkos_tools_build_elcapitan.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# set top-level script parameters +umask 022 +set -e +set -x + +# create vars for common directories and files +dir_root="`git rev-parse --show-toplevel`" +dir_pwd="` pwd -P `" +dir_src="${dir_pwd}/kokkos-tools/profiling/space-time-stack" +dir_build="${dir_pwd}/kokkos-tools/profiling/space-time-stack" +file_log="${dir_pwd}/kokkos_tools_build.log" + +# redirect STDOUT and STDERR through tee +exec &> >(tee >(ts '[%Y-%m-%d %H:%M:%S]' > "${file_log}")) + +# let's turn on verbosity now +set -v + +# output for posterity +hostname +uptime +lscpu + +# clean and reset source +pushd "${dir_src}" +git clean -fdx +git reset --hard +popd + +# create build directory +# test -d "${dir_build}" && rm -rf 
"${dir_build}" +# mkdir -p "${dir_build}" + +# build +# list current environment +module list +# alter environment +. lammps_env_elcapitan.sh +# list current environment +module list +pushd "${dir_build}" +/usr/bin/time --verbose -- \ + nice -n 1 \ + gmake CXX=CC +popd + +# gracefully exit +exit 0 diff --git a/docs/32_lammpsACE/kokkos_tools_clone.sh b/docs/32_lammpsACE/kokkos_tools_clone.sh new file mode 100755 index 0000000..22e8521 --- /dev/null +++ b/docs/32_lammpsACE/kokkos_tools_clone.sh @@ -0,0 +1,2 @@ +#!/bin/sh +git clone git@github.com:kokkos/kokkos-tools.git diff --git a/docs/32_lammpsACE/kokkos_tools_env_elcapitan.sh b/docs/32_lammpsACE/kokkos_tools_env_elcapitan.sh new file mode 100644 index 0000000..6596b09 --- /dev/null +++ b/docs/32_lammpsACE/kokkos_tools_env_elcapitan.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +dir_root="`git rev-parse --show-toplevel`" +export KOKKOS_TOOLS_LIBS="${dir_root}/docs/32_lammpsACE/kokkos-tools/profiling/space-time-stack/kp_space_time_stack.so" diff --git a/docs/32_lammpsACE/lammpsACE.rst b/docs/32_lammpsACE/lammpsACE.rst index f398eb7..7beec84 100644 --- a/docs/32_lammpsACE/lammpsACE.rst +++ b/docs/32_lammpsACE/lammpsACE.rst @@ -2,55 +2,557 @@ LAMMPS ACE ********** -https://docs.lammps.org/pair_pace.html +.. note:: + The documentation herein needs to be updated for current + performance. + +This is the documentation for the benchmark [LAMMPS]_, specifically +KOKKOS-LAMMPS (see [KOKKOS-LAMMPS]_). The content herein was created +by the following authors (in alphabetical order). + +- `Anthony M. Agelastos `_ +- `Stan Moore `_ + +This material is based upon work supported by the Sandia National +Laboratories (SNL), a multimission laboratory managed and operated by +National Technology and Engineering Solutions of Sandia under the +U.S. Department of Energy's National Nuclear Security Administration +under contract DE-NA0003525. Content herein considered unclassified +with unlimited distribution under SAND2023-01070O. 
Purpose ======= +Heavily pulled from their [lammps-site]_: + + LAMMPS is a classical molecular dynamics code with a focus on + materials modeling. It's an acronym for Large-scale + Atomic/Molecular Massively Parallel Simulator. LAMMPS has + potentials for solid-state materials (metals, semiconductors) and + soft matter (biomolecules, polymers) and coarse-grained or + mesoscopic systems. It can be used to model atoms or, more + generically, as a parallel particle simulator at the atomic, meso, + or continuum scale. LAMMPS runs on single processors or in parallel + using message-passing techniques and a spatial-decomposition of the + simulation domain. Many of its models have versions that provide + accelerated performance on CPUs, GPUs, and Intel Xeon Phis. The + code is designed to be easy to modify or extend with new + functionality. + Characteristics =============== -Problems --------- +The goal is to utilize the specified version of LAMMPS (see +:ref:`LAMMPSApplicationVersion`) that runs the benchmark problem (see +:ref:`LAMMPSProblem`) correctly (see :ref:`LAMMPSCorrectness` if +changes are made to LAMMPS). + + +.. _LAMMPSApplicationVersion: + +Application Version +------------------- + +The command to clone is provided below. + +.. literalinclude:: lammps_clone.sh + :language: sh + :lines: 2- + +.. note:: + The Git SHA will be updated with a tag soon. + +The script to clone can be downloaded from :download:`lammps_clone.sh +<lammps_clone.sh>`. It can also be executed in place to clone into +``docs/32_lammpsACE/lammps``. + +.. code-block:: bash + + cd docs/32_lammpsACE + ./lammps_clone.sh + + +.. _LAMMPSProblem: + +Problem +------- + +This problem runs an ACE (atomic cluster expansion) machine-learned +potential for a copper crystal using a face-centered cubic (fcc) +lattice at 300 K. Please refer to [pace-site]_ and [pace-article]_ for +more information. + +This problem is *mostly* present within the upstream LAMMPS +repository. 
The components of this problem are listed below (paths +given are within the LAMMPS repository). Each of these files will need to +be copied into a run directory for the simulation. + +``examples/PACKAGES/pace/Cu-PBE-core-rep.ace`` + This is an input needed for the simulation. + +``examples/PACKAGES/pace/in.pace.product`` + This is the default input file that controls the simulation. Some + parameters within this file may need to be changed depending upon + what is being run (i.e., these parameters control how much memory + it uses). The modified version of this within the template + directory should be preferred; more on this below. + +A template run directory was created to help ease performing a +simulation; this directory is ``templatedir``. There are some key +files within it. + +``templatedir/in.pace.product`` + This is a modified version of the input file with some key + parameters changed to be more appropriate as a benchmark. It is + designed to run for approximately 11 minutes in 2 phases of 5.5 + minutes each. LAMMPS already directly computes the FOM and outputs + it for each of the phases. The FOM from the second 5.5-minute phase + is the one that is to be tracked. + +``templatedir/lammps_ln.sh`` + This file creates symbolic links to files and folders needed for + the simulation. + +``templatedir/lammps_batch_elcapitan.sh`` + This is a batch script compatible with El Capitan. It has + capabilities for setting key job parameters from the command line; + more on that below. + + +An excerpt from this input file that has its key parameters is +provided below. + +.. 
code-block:: + :emphasize-lines: 2,7 + + + variable L index 64.0 + region box block 0 ${L} 0 ${L} 0 ${L} + + pair_style pace product chunksize 49152 + + thermo 10 + thermo_style custom step cpu temp epair etotal press v_delenergy v_delpress + + ################################## + ### Benchmarking modifications ### + ################################## + + # Add a thermostat to keep temperature from falling + variable tdamp equal $(dt) + fix mynvt all nvt temp 300.0 300.0 ${tdamp} + + # Some systems buffer extensively + thermo_modify flush yes + + # Print out the value of L for parsing ease + print "The value of L is $L" + + ### Throw out first 5 minutes for hardware equilibrium + + # Stop after 5.5 minutes + fix 2 all halt 10 tlimit > 330.0 message no error continue + run 10000000 + + ### Run another 5 minutes for final FOM + unfix 2 + + # Stop after 5.5 minutes + fix 3 all halt 10 tlimit > 330.0 message no + run 10000000 + +These parameters are described below. + +``L`` + This corresponds to the **l**\ ength scale factor. This will scale + the dimensions of the problem. + +``thermo`` + Compute and print thermodynamic info (e.g., temperature, energy, + pressure) on timesteps that are a multiple of this parameter and at + the beginning and end of a simulation. + +This problem exhibits different runtime characteristics whether or not +Kokkos is enabled. Specifically, there is some work that is performed +within Kokkos that helps to keep this problem as well behaved from a +throughput perspective as possible. Ergo, Kokkos must be enabled for +the simulations regardless of the hardware being used (the cases +herein have configurations that enable it for reference). + Figure of Merit --------------- +Each LAMMPS simulation writes out a file named "log.lammps". At the +end of this simulation is a block that resembles the following +example. + +.. 
code-block:: + :emphasize-lines: 11 + + Step CPU Temp E_pair TotEng Press v_delenergy v_delpress + 640 0 299.7264 -3834241 -3793616.4 62562.774 -3.7252903e-08 4.8748916e-10 + 650 5.1882405 300.1416 -3834085.9 -3793405 62656.487 3.7252903e-08 2.2555469e-10 + 660 10.389581 300.04536 -3834003.9 -3793336 62705.836 -1.4901161e-08 2.910383e-11 + + 1260 323.38353 300.55705 -3834187.5 -3793450.4 62842.117 9.778887e-09 1.5279511e-10 + 1270 328.58739 300.25528 -3834141.7 -3793445.4 62861.607 1.0244548e-08 -5.0931703e-10 + 1280 333.79045 300.1357 -3834154.7 -3793474.6 62856.262 -1.1641532e-08 1.6734703e-10 + Loop time of 333.812 on 4 procs for 640 steps with 1048576 atoms + + Performance: 0.083 ns/day, 289.767 hours/ns, 1.917 timesteps/s, 2.010 Matom-step/s + 45.1% CPU use with 4 MPI tasks x 1 OpenMP threads + +The quantity of interest (QOI) is "Mega atom steps per second," which +is directly computed as ``Matom-step/s`` in the example above. + +It is desired to capture the FOM for varying problem sizes that +encompass utilizing 50% to 80% of available memory (when all PEs are +utilized). The ultimate goal is to maximize this throughput FOM while +utilizing at least 50% of available memory. + + +.. _LAMMPSCorrectness: + +Correctness +----------- + +The aforementioned relevant block of output within "log.lammps" is +replicated below. + +.. 
code-block:: + :emphasize-lines: 2,3,4,6,7,8 + + Step CPU Temp E_pair TotEng Press v_delenergy v_delpress + 640 0 299.7264 -3834241 -3793616.4 62562.774 -3.7252903e-08 4.8748916e-10 + 650 5.1882405 300.1416 -3834085.9 -3793405 62656.487 3.7252903e-08 2.2555469e-10 + 660 10.389581 300.04536 -3834003.9 -3793336 62705.836 -1.4901161e-08 2.910383e-11 + + 1260 323.38353 300.55705 -3834187.5 -3793450.4 62842.117 9.778887e-09 1.5279511e-10 + 1270 328.58739 300.25528 -3834141.7 -3793445.4 62861.607 1.0244548e-08 -5.0931703e-10 + 1280 333.79045 300.1357 -3834154.7 -3793474.6 62856.262 -1.1641532e-08 1.6734703e-10 + Loop time of 333.812 on 4 procs for 640 steps with 1048576 atoms + + Performance: 0.083 ns/day, 289.767 hours/ns, 1.917 timesteps/s, 2.010 Matom-step/s + 45.1% CPU use with 4 MPI tasks x 1 OpenMP threads + +There are several columns of interest regarding correctness; these are +listed below. + +``Step`` + This is the step number and is the first column. + +``Temp`` + This tracks the temperature aspect of the simulation. + +``Press`` + This tracks the pressure aspect of the simulation. + +Assessing the correctness will involve comparing these quantities +across modified (henceforth denoted with "mod" subscript) and +unmodified ("unmod" subscript) LAMMPS subject to the methodology +below. + +The **first** step is to adjust the ``thermo`` parameter +to a value of 1 so fine-grained output is generated; if this is +significantly slowing down computation, then it can be increased to a +value of 10. Then, produce output from LAMMPS\ :sub:`unmod` with the +same settings. + +The **second** step is to compute the absolute differences between +modified and unmodified LAMMPS for ``Temp`` and ``Press`` for each +row, *i*, whose ``Step`` is relevant for the FOM for LAMMPS\ +:sub:`mod`, + +.. 
math:: + \Delta \texttt{Temp}_i &= | \texttt{Temp}_{\textrm{mod},i}-\texttt{Temp}_{\textrm{unmod},i} | \\ + \Delta \texttt{Press}_i &= | \texttt{Press}_{\textrm{mod},i}-\texttt{Press}_{\textrm{unmod},i} | \\ + +where + +* *i* is each line whose ``CPU`` time is part of the second phase for LAMMPS\ :sub:`mod` + +The **third** step is to compute the arithmetic mean of each of the +aforementioned quantities over the *n* rows, + +.. math:: + \mu _{\Delta \texttt{Temp}} &= \frac{\sum_{i} \Delta \texttt{Temp}_i}{n} \\ + \mu _{\Delta \texttt{Press}} &= \frac{\sum_{i} \Delta \texttt{Press}_i}{n} \\ + +where + +.. math:: + n = \sum_{i} 1 + +The **fourth** step is to compute the arithmetic mean of the *n* +matching rows of the unmodified LAMMPS, + +.. math:: + \mu _{\texttt{Temp},\textrm{unmod}} &= \frac{\sum_{i} \texttt{Temp}_{\textrm{unmod},i}}{n} \\ + \mu _{\texttt{Press},\textrm{unmod}} &= \frac{\sum_{i} \texttt{Press}_{\textrm{unmod},i}}{n} \\ + +The **fifth** step is to normalize the differences with the baseline +values to create the error ratios, + +.. math:: + \varepsilon _{\texttt{Temp}} &= \frac{\mu _{\Delta \texttt{Temp}}}{\mu _{\texttt{Temp},\textrm{unmod}}} \\ + \varepsilon _{\texttt{Press}} &= \frac{\mu _{\Delta \texttt{Press}}}{\mu _{\texttt{Press},\textrm{unmod}}} \\ + +The **sixth** and final step is to check over all of the error ratios +and if any of them exceed 5%, then the modifications are not approved +without discussing them with this benchmark's authors. The success +criteria are: + +.. math:: + \varepsilon _{\texttt{Temp}} &\le 5\% \\ + \varepsilon _{\texttt{Press}} &\le 5\% -Source code modifications + +Source Code Modifications ========================= -Please see :ref:`GlobalRunRules` for general guidance on allowed modifications. +Please see :ref:`GlobalRunRules` for general guidance on allowed +modifications. + + +System Information +================== + +The platforms utilized for benchmarking activities are listed and +described below. 
+ +* Advanced Technology System 4 (ATS-4), also known as El Capitan (see + :ref:`ElCapitanSystemDescription`) + Building ======== +A script (``lammps_clone.sh``) is provided to clone the LAMMPS +repository within the "lammps" folder. Instructions are provided on +how to build LAMMPS for the following systems: + +* Generic (see :ref:`BuildLammpsGeneric`) +* Advanced Technology System 4 (ATS-4), also known as El Capitan (see + :ref:`BuildLammpsATS4`) + + +.. _BuildLammpsGeneric: + +Generic +------- + +Refer to LAMMPS's [lammps-build]_ documentation for generic +instructions. + + +.. _BuildLammpsATS4: + +El Capitan +---------- + +Instructions for building on El Capitan are provided below. These +instructions assume this repository has been cloned and that the +current working directory is at the top level of this repository. + +.. code-block:: bash + + cd docs/32_lammpsACE + ./lammps_build_elcapitan.sh + +The script discussed above is :download:`lammps_build_elcapitan.sh +<lammps_build_elcapitan.sh>` and is produced below for convenience and +reference. + +.. literalinclude:: lammps_build_elcapitan.sh + :language: bash + Running ======= +Instructions are provided on how to run LAMMPS for the following +systems: -Validation -========== +* Advanced Technology System 4 (ATS-4), also known as El Capitan (see + :ref:`LAMMPSRunATS4`) + * Profiling with Kokkos Tools on El Capitan (see + :ref:`LAMMPSProfileKokkosToolsElCapitan`) + + +.. _LAMMPSRunATS4: + +El Capitan +---------- + +.. note:: + + This section will be updated with some more content soon. + +An example for performing simulations on El Capitan is +provided below. + +.. 
code-block:: bash + + # first, copy templatedir into something useful + cp -a templatedir useful + + # next, go into the run folder + cd useful + + # submit job and set parameters on command line if desired + # this example sets L (aka lammps_len) to 64 + # this example turns on Kokkos Tools profiling (aka is_kokkos_tools) + # this example runs on 1 node (aka --nodes=1) + lammps_len=64 is_kokkos_tools=1 flux batch --nodes=1 lammps_batch_elcapitan.sh + + +.. _LAMMPSProfileKokkosToolsElCapitan: + +Profiling with Kokkos Tools +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Scripts are provided to clone and build Kokkos Tools. The steps to do +both are provided below. -Example Scalability Results -=========================== +.. code-block:: bash + # go into the LAMMPS documentation folder + cd docs/32_lammpsACE -Memory Usage -============ + # clone Kokkos Tools + ./kokkos_tools_clone.sh + # build Kokkos Tools' Space Time + ./kokkos_tools_build_elcapitan.sh -Strong Scaling on El Capitan -============================ +Once built, the command line variable ``is_kokkos_tools`` can be set +to ``1`` for the batch script to turn it on. After a successful run, +it will output additional memory information. An example of this (for +``L`` equal to 64) on El Capitan is provided below that shows +approximately 99.6 GB of memory allocated on each GPU. +.. code-block:: -Weak Scaling on El Capitan -========================== + KOKKOS HIP SPACE: + =================== + MAX MEMORY ALLOCATED: 99615719.6 kB + + +.. 
_LAMMPSResults: + +Verification of Results +======================= + +Additional information: + +* The sub-section :ref:`LAMMPSComputeFOM` describes how to compute the + FOM + +Single-node results from LAMMPS are provided on the following systems: + +* Advanced Technology System 4 (ATS-4), also known as El Capitan (see + :ref:`ResultsLammpsATS4`) + +Multi-node results from LAMMPS are provided on the following system(s): + +* Advanced Technology System 4 (ATS-4), also known as El Capitan (see + :ref:`ResultsLammpsScaleATS4`) + + +.. _LAMMPSComputeFOM: + +Compute Figure of Merit +----------------------- + +The figure of merit (FOM) is automatically computed by LAMMPS. The +benchmark run is broken into two phases; extract the FOM from the last +phase. The relevant excerpt from the "log.lammps" output is below. + +.. code-block:: + :emphasize-lines: 11 + + Step CPU Temp E_pair TotEng Press v_delenergy v_delpress + 640 0 299.7264 -3834241 -3793616.4 62562.774 -3.7252903e-08 4.8748916e-10 + 650 5.1882405 300.1416 -3834085.9 -3793405 62656.487 3.7252903e-08 2.2555469e-10 + 660 10.389581 300.04536 -3834003.9 -3793336 62705.836 -1.4901161e-08 2.910383e-11 + + 1260 323.38353 300.55705 -3834187.5 -3793450.4 62842.117 9.778887e-09 1.5279511e-10 + 1270 328.58739 300.25528 -3834141.7 -3793445.4 62861.607 1.0244548e-08 -5.0931703e-10 + 1280 333.79045 300.1357 -3834154.7 -3793474.6 62856.262 -1.1641532e-08 1.6734703e-10 + Loop time of 333.812 on 4 procs for 640 steps with 1048576 atoms + + Performance: 0.083 ns/day, 289.767 hours/ns, 1.917 timesteps/s, 2.010 Matom-step/s + 45.1% CPU use with 4 MPI tasks x 1 OpenMP threads + +The FOM is the quantity ``Matom-step/s``, which in this example is 2.010. + + +.. _ResultsLammpsATS4: + +El Capitan - Single Node +------------------------ + +.. note:: + + This section will be updated with some more content soon. + +A single-node example is below that showcases 2.010 Mega atom +steps per second per node. 
The other relevant parameters are displayed +as part of the output. + +.. code-block:: + :emphasize-lines: 11 + + Step CPU Temp E_pair TotEng Press v_delenergy v_delpress + 640 0 299.7264 -3834241 -3793616.4 62562.774 -3.7252903e-08 4.8748916e-10 + 650 5.1882405 300.1416 -3834085.9 -3793405 62656.487 3.7252903e-08 2.2555469e-10 + 660 10.389581 300.04536 -3834003.9 -3793336 62705.836 -1.4901161e-08 2.910383e-11 + + 1260 323.38353 300.55705 -3834187.5 -3793450.4 62842.117 9.778887e-09 1.5279511e-10 + 1270 328.58739 300.25528 -3834141.7 -3793445.4 62861.607 1.0244548e-08 -5.0931703e-10 + 1280 333.79045 300.1357 -3834154.7 -3793474.6 62856.262 -1.1641532e-08 1.6734703e-10 + Loop time of 333.812 on 4 procs for 640 steps with 1048576 atoms + + Performance: 0.083 ns/day, 289.767 hours/ns, 1.917 timesteps/s, 2.010 Matom-step/s + 45.1% CPU use with 4 MPI tasks x 1 OpenMP threads + + +.. _ResultsLammpsScaleATS4: + +El Capitan - Many Nodes +----------------------- + +.. note:: + + This section will be updated with some more content soon. References ========== + +.. [LAMMPS] LAMMPS - a flexible simulation tool for particle-based + materials modeling at the atomic, meso, and continuum scales, + A. P. Thompson, H. M. Aktulga, R. Berger, D. S. Bolintineanu, + W. M. Brown, P. S. Crozier, P. J. in't Veld, A. Kohlmeyer, + S. G. Moore, T. D. Nguyen, R. Shan, M. J. Stevens, J. Tranchida, + C. Trott, S. J. Plimpton, Comp Phys Comm, 271 (2022) 10817. +.. [lammps-site] LAMMPS Developers, 'LAMMPS Molecular Dynamics Simulator', 2026. + [Online]. Available: https://lammps.org. [Accessed: 15- Feb- 2026] +.. [lammps-build] LAMMPS Developers, 'LAMMPS Documentation', 2026. + [Online]. Available: https://docs.lammps.org/Manual.html. + [Accessed: 15- Feb- 2026] +.. [pace-site] LAMMPS Developers, 'pair_style pace command - LAMMPS Documentation', 2026. + [Online]. Available: https://docs.lammps.org/pair_pace.html#description +.. [pace-article] Lysogorskiy, Y., Oord, C.v.d., Bochkarev, A. 
et al., + Performant implementation of the atomic cluster expansion (PACE) + and application to copper and silicon. NPJ Comput. Mater. 7, 97 (2021). # codespell:ignore + https://doi.org/10.1038/s41524-021-00559-9 +.. [KOKKOS-LAMMPS] Anders Johansson, Evan Weinberg, Christian Trott, Megan McCarthy, and Stan Moore. + 2025. LAMMPS-KOKKOS: Performance Portable Molecular Dynamics Across Exascale Architectures. + In Proceedings of the SC '25 Workshops of the International Conference for High Performance + Computing, Networking, Storage and Analysis (SC Workshops '25). + Association for Computing Machinery, New York, NY, USA, 1217–1232. + https://doi.org/10.1145/3731599.3767498 diff --git a/docs/32_lammpsACE/lammps_build_elcapitan.sh b/docs/32_lammpsACE/lammps_build_elcapitan.sh new file mode 100755 index 0000000..30fd4f5 --- /dev/null +++ b/docs/32_lammpsACE/lammps_build_elcapitan.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +# set top-level script parameters +umask 022 +set -e +set -x + +# create vars for common directories and files +dir_root="`git rev-parse --show-toplevel`" +dir_pwd="` pwd -P `" +dir_src="${dir_pwd}/lammps" +dir_build="${dir_pwd}/lammps/_build" +file_log="${dir_pwd}/lammps_build.log" + +# redirect STDOUT and STDERR through tee +exec &> >(tee >(ts '[%Y-%m-%d %H:%M:%S]' > "${file_log}")) + +# let's turn on verbosity now +set -v + +# output for posterity +hostname +uptime +lscpu + +# clean and reset source +pushd "${dir_src}" +git clean -fdx +git reset --hard +popd + +# create build directory +test -d "${dir_build}" && rm -rf "${dir_build}" +mkdir -p "${dir_build}" + +# build +# list current environment +module list +# alter environment +. 
lammps_env_elcapitan.sh +# list current environment +module list +pushd "${dir_build}" +cmake \ + -C ../cmake/presets/elcapitan_kokkos.cmake \ + -DPKG_ML-PACE=on \ + -DBUILD_MPI=on \ + -D CMAKE_BUILD_TYPE=Release \ + ../cmake +/usr/bin/time --verbose -- \ + nice -n 1 \ + gmake -j 64 +popd + +# gracefully exit +exit 0 diff --git a/docs/32_lammpsACE/lammps_clone.sh b/docs/32_lammpsACE/lammps_clone.sh new file mode 100755 index 0000000..b4879a8 --- /dev/null +++ b/docs/32_lammpsACE/lammps_clone.sh @@ -0,0 +1,4 @@ +#!/bin/sh +git clone git@github.com:lammps/lammps.git +cd lammps +git checkout a51f9ba0e719be544293987bb3cbd9939f1b01ee diff --git a/docs/32_lammpsACE/lammps_env_elcapitan.sh b/docs/32_lammpsACE/lammps_env_elcapitan.sh new file mode 100644 index 0000000..5dad5e9 --- /dev/null +++ b/docs/32_lammpsACE/lammps_env_elcapitan.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +module load craype-accel-amd-gfx942 +module load PrgEnv-cray +module load rocm/6.2.1 +module load python + +export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_OFI_NIC_POLICY=GPU + +### FIXME ### Need a system wide install of libfabric from SHS 11 (or newer) +export LD_LIBRARY_PATH=/usr/workspace/wsb/accept/packages-2024/SHS11_lib:${LD_LIBRARY_PATH} + +export HIP_PATH=`hipconfig -p` +export LD_LIBRARY_PATH=${HIP_PATH}/lib:${LD_LIBRARY_PATH} + +### Tell libfabric to only look for the ROCm runtime, not cuda, etc. 
+export FI_HMEM="rocr" + +# Have malloc() calls use huge pages +export HUGETLB_MORECORE=yes + +# restrict libhugetlbfs to be enabled for these executables only: +export HUGETLB_RESTRICT_EXE="defrag:lmp" + +export HSA_XNACK=1 diff --git a/docs/32_lammpsACE/templatedir/in.pace.product b/docs/32_lammpsACE/templatedir/in.pace.product new file mode 100644 index 0000000..c8d6dee --- /dev/null +++ b/docs/32_lammpsACE/templatedir/in.pace.product @@ -0,0 +1,61 @@ +# simple test of fcc Cu with ACE product + +units metal +atom_style atomic + +neighbor 0.3 bin +neigh_modify every 2 delay 10 check yes + +variable a equal 3.597 +lattice fcc $a +variable L index 64.0 +region box block 0 ${L} 0 ${L} 0 ${L} +create_box 1 box +create_atoms 1 box + +mass 1 26.98 + +pair_style pace product chunksize 49152 +pair_coeff * * Cu-PBE-core-rep.ace Cu + +velocity all create 300 8728 loop geom +timestep 0.0005 +fix 1 all nve + +compute eatom all pe/atom +compute energy all reduce sum c_eatom +variable delenergy equal c_energy-pe + +compute satom all stress/atom NULL +compute str all reduce sum c_satom[1] c_satom[2] c_satom[3] +variable delpress equal -(c_str[1]+c_str[2]+c_str[3])/(3*vol)-press + +thermo 10 +thermo_style custom step cpu temp epair etotal press v_delenergy v_delpress + +################################## +### Benchmarking modifications ### +################################## + +# Add a thermostat to keep temperature from falling +variable tdamp equal $(dt) +fix mynvt all nvt temp 300.0 300.0 ${tdamp} + +# Some systems buffer extensively +thermo_modify flush yes + +# Print out the value of L for parsing ease +print "The value of L is $L" + +### Throw out first 5 minutes for hardware equilibrium + +# Stop after 5.5 minutes +fix 2 all halt 10 tlimit > 330.0 message no error continue +run 10000000 + +### Run another 5 minutes for final FOM +unfix 2 + +# Stop after 5.5 minutes +fix 3 all halt 10 tlimit > 330.0 message no +run 10000000 diff --git 
a/docs/32_lammpsACE/templatedir/lammps_batch_elcapitan.sh b/docs/32_lammpsACE/templatedir/lammps_batch_elcapitan.sh new file mode 100755 index 0000000..6ffdf96 --- /dev/null +++ b/docs/32_lammpsACE/templatedir/lammps_batch_elcapitan.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +#flux: # --nodes=1 +#flux: -u +#flux: --exclusive +#flux: -q pbatch +#flux: -t 20 +#flux: --job-name=lammps-fcr-fy30 +#flux: --setattr=thp=always +#flux: --setattr=hugepages=512GB + +# e.g., to set the L parameter to a different value: lammps_len=2.0 flux batch lammps_batch_elcapitan.sh +# e.g., to turn on Kokkos Tools Space Time: is_kokkos_tools=1 flux batch lammps_batch_elcapitan.sh + +# define runtime params (NOTE(review): lammps_len defaults to 1 here, whereas in.pace.product defaults L to 64.0 - confirm intended) +lammps_len=${lammps_len:-1} +is_kokkos_tools=${is_kokkos_tools:-0} +flux_job_nodes=${flux_job_nodes:-`flux resource list -s up -no {nnodes}`} +echo "lammps_len=${lammps_len}" +echo "is_kokkos_tools=${is_kokkos_tools}" +echo "flux_job_nodes=${flux_job_nodes}" + +# define useful locations +dir_base="` pwd -P `" + +# set up environment appropriately +. lammps_env_elcapitan.sh +test ${is_kokkos_tools} -eq 1 && . kokkos_tools_env_elcapitan.sh + +# run on 4 GPUs per node +flux run \ + -u \ + --exclusive \ + --verbose \ + -N ${flux_job_nodes} \ + -n $((4 * flux_job_nodes)) \ + -x \ + -c 24 \ + -o cpu-affinity=off \ + -o gpu-affinity=off \ + -o mpibind=on,smt:1,verbose:0 \ + "${dir_base}/lammps/_build/lmp" \ + -sf kk -k on g 1 -pk kokkos neigh half newton on \ + -in in.pace.product \ + -var L ${lammps_len} diff --git a/docs/32_lammpsACE/templatedir/lammps_ln.sh b/docs/32_lammpsACE/templatedir/lammps_ln.sh new file mode 100755 index 0000000..3a85645 --- /dev/null +++ b/docs/32_lammpsACE/templatedir/lammps_ln.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +set -v + +test ! -e lammps_env_elcapitan.sh && ln -s ../lammps_env_elcapitan.sh +test ! -e kokkos_tools_env_elcapitan.sh && ln -s ../kokkos_tools_env_elcapitan.sh +test ! -e lammps && ln -s ../lammps +test ! 
-e Cu-PBE-core-rep.ace && ln -s ../lammps/examples/PACKAGES/pace/Cu-PBE-core-rep.ace + +exit 0