Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds support for Corona to the llnl-cluster system #519

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion systems/llnl-cluster/externals/base/00-packages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ packages:
hwloc:
buildable: false
externals:
- spec: hwloc@2.9.1
- spec: hwloc@2.11.1
prefix: /usr
fftw:
buildable: false
Expand Down
144 changes: 142 additions & 2 deletions systems/llnl-cluster/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,22 @@

# Per-cluster resource table, keyed by cluster name. Each entry's key/value
# pairs name the scheduler and per-node core count; only "corona" also lists
# a per-node GPU count.
id_to_resources = {
    "ruby": {"scheduler": "slurm", "sys_cores_per_node": 56},
    "magma": {"scheduler": "slurm", "sys_cores_per_node": 96},
    "dane": {"scheduler": "slurm", "sys_cores_per_node": 112},
    "corona": {
        "scheduler": "flux",
        "sys_cores_per_node": 96,
        "sys_gpus_per_node": 8,
    },
}


Expand All @@ -26,7 +34,7 @@ class LlnlCluster(System):
# Cluster selector. The accepted values match the keys of id_to_resources,
# including "corona". The stale pre-change `values=` tuple is dropped because
# a repeated keyword argument is a SyntaxError.
variant(
    "cluster",
    default="ruby",
    values=("ruby", "magma", "dane", "corona"),
    description="Which cluster to run on",
)

Expand All @@ -48,11 +56,18 @@ class LlnlCluster(System):
default="intel-oneapi-mkl",
description="Which blas to use",
)

# ROCm version selector. The `when` clause restricts this variant to
# cluster=corona; the chosen value is what external_pkg_configs passes to
# rocm_config to fill in the /opt/rocm-<version> paths.
variant(
    "rocm",
    default="6.0.2",
    values=("5.4.3", "5.5.1", "5.6.1", "5.7.1", "6.0.2"),
    when="cluster=corona",
    description="ROCm version",
)

def initialize(self):
    """Initialize the system and apply the selected cluster's resources.

    Looks up the value of the ``cluster`` variant in ``id_to_resources`` and
    copies every key/value pair of that entry onto ``self`` as an attribute
    (``scheduler``, ``sys_cores_per_node`` and, where present,
    ``sys_gpus_per_node``).

    Raises:
        ValueError: If the selected cluster has no ``id_to_resources`` entry.
    """
    super().initialize()

    cluster = self.spec.variants["cluster"][0]
    attrs = id_to_resources.get(cluster)
    if attrs is None:
        # Fail with the cluster name instead of an opaque AttributeError
        # raised by calling .items() on None.
        raise ValueError(
            f"No resource description for cluster '{cluster}'; "
            f"known clusters: {', '.join(id_to_resources)}"
        )

    # No hard-coded scheduler default here: every table entry supplies its
    # own "scheduler", and a fixed "slurm" would be wrong for flux clusters.
    for k, v in attrs.items():
        setattr(self, k, v)
Expand All @@ -65,18 +80,126 @@ def generate_description(self, output_dir):
with open(sw_description, "w") as f:
f.write(self.sw_description())

def rocm_config(self, rocm_version):
    """Render a ``packages:`` config that lists ROCm components as externals.

    Each ``{x}`` placeholder in the template is replaced with
    ``rocm_version`` through ``str.format``, so the package specs and the
    ``/opt/rocm-{x}`` prefixes all carry the same version string.

    Args:
        rocm_version: Value substituted for ``{x}``; the caller in this
            file passes the selected value of the ``rocm`` variant.

    Returns:
        The formatted template text.
    """
    # NOTE(review): the leading whitespace of the template lines is not
    # visible in this view, so the YAML nesting cannot be confirmed here.
    # NOTE(review): the `llvm` spec line reads "[email protected]", which looks
    # like a redaction artifact rather than a real spec; the intended value
    # cannot be recovered from this view -- restore it before use.
    # rocrand is the one entry whose prefix points at a subdirectory
    # (/opt/rocm-{x}/hiprand); llvm and llvm-amdgpu both use /opt/rocm-{x}/llvm.
    template = """\
packages:
hipfft:
externals:
- spec: hipfft@{x}
prefix: /opt/rocm-{x}
rocfft:
externals:
- spec: rocfft@{x}
prefix: /opt/rocm-{x}
rocprim:
externals:
- spec: rocprim@{x}
prefix: /opt/rocm-{x}
rocrand:
externals:
- spec: rocrand@{x}
prefix: /opt/rocm-{x}/hiprand
rocsparse:
externals:
- spec: rocsparse@{x}
prefix: /opt/rocm-{x}
rocthrust:
externals:
- spec: rocthrust@{x}
prefix: /opt/rocm-{x}
hip:
externals:
- spec: hip@{x}
prefix: /opt/rocm-{x}
hsa-rocr-dev:
externals:
- spec: hsa-rocr-dev@{x}
prefix: /opt/rocm-{x}
comgr:
externals:
- spec: comgr@{x}
prefix: /opt/rocm-{x}/
hipsparse:
externals:
- spec: hipsparse@{x}
prefix: /opt/rocm-{x}
hipblas:
externals:
- spec: hipblas@{x}
prefix: /opt/rocm-{x}/
hsakmt-roct:
externals:
- spec: hsakmt-roct@{x}
prefix: /opt/rocm-{x}/
roctracer-dev-api:
externals:
- spec: roctracer-dev-api@{x}
prefix: /opt/rocm-{x}/
rocminfo:
externals:
- spec: rocminfo@{x}
prefix: /opt/rocm-{x}/
llvm:
externals:
- spec: [email protected]
prefix: /opt/rocm-{x}/llvm
llvm-amdgpu:
externals:
- spec: llvm-amdgpu@{x}
prefix: /opt/rocm-{x}/llvm
rocblas:
externals:
- spec: rocblas@{x}
prefix: /opt/rocm-{x}
rocsolver:
externals:
- spec: rocsolver@{x}
prefix: /opt/rocm-{x}
"""
    return template.format(x=rocm_version)

def external_pkg_configs(self):
    """Return the list of external-package config files for this system.

    The base packages file always comes first. For ``cluster=corona`` a
    ROCm packages config is written to the path returned by
    ``next_adhoc_cfg()`` and appended. Finally, the MPI externals file
    matching the selected compiler (gcc or intel) is appended, if any.
    """
    ext_root = LlnlCluster.resource_location / "externals"
    configs = [ext_root / "base" / "00-packages.yaml"]

    if self.spec.satisfies("cluster=corona"):
        adhoc_path = self.next_adhoc_cfg()
        with open(adhoc_path, "w") as out:
            rocm_version = self.spec.variants["rocm"][0]
            out.write(self.rocm_config(rocm_version))
        configs.append(adhoc_path)

    # First matching compiler wins, mirroring an if/elif chain.
    mpi_by_compiler = (
        ("compiler=gcc", "00-gcc-packages.yaml"),
        ("compiler=intel", "01-intel-packages.yaml"),
    )
    for constraint, filename in mpi_by_compiler:
        if self.spec.satisfies(constraint):
            configs.append(ext_root / "mpi" / filename)
            break

    return configs

def rocm_compiler_cfg(self, rocm_version):
    """Render a ``compilers:`` config entry for the ROCm compiler.

    Each ``{x}`` placeholder is replaced with ``rocm_version`` through
    ``str.format``: in the ``rocmcc@{x}`` spec, in the
    amdclang/amdclang++/amdflang paths under ``/opt/rocm-{x}/bin``, and in
    the ``/opt/rocm-{x}/lib`` entries used for ``LIBRARY_PATH`` and
    ``extra_rpaths``.

    Args:
        rocm_version: Value substituted for ``{x}``; the caller in this
            file passes the selected value of the ``rocm`` variant.

    Returns:
        The formatted template text.
    """
    # NOTE(review): the leading whitespace of the template lines is not
    # visible in this view, so the YAML nesting cannot be confirmed here.
    # operating_system (rhel8) and target (x86_64) are fixed in the template
    # and do not vary with rocm_version.
    template = """\
compilers:
- compiler:
spec: rocmcc@{x}
paths:
cc: /opt/rocm-{x}/bin/amdclang
cxx: /opt/rocm-{x}/bin/amdclang++
f77: /opt/rocm-{x}/bin/amdflang
fc: /opt/rocm-{x}/bin/amdflang
flags:
cflags: -g -O2
cxxflags: -g -O2
operating_system: rhel8
target: x86_64
modules: []
environment:
prepend_path:
LIBRARY_PATH: /opt/rocm-{x}/lib
extra_rpaths:
- /opt/rocm-{x}/lib
"""
    return template.format(x=rocm_version)

def compiler_configs(self):
compilers = LlnlCluster.resource_location / "compilers"

Expand All @@ -86,13 +209,29 @@ def compiler_configs(self):
elif self.spec.satisfies("compiler=intel"):
selections.append(compilers / "intel" / "00-intel-2021-6-0-compilers.yaml")

if self.spec.satisfies("cluster=corona"):
compiler_cfg_path = self.next_adhoc_cfg()
with open(compiler_cfg_path, "w") as f:
f.write(self.rocm_compiler_cfg(self.spec.variants["rocm"][0]))
selections.append(compiler_cfg_path)

return selections

def sw_description(self):
"""This is somewhat vestigial, and may be deleted later. The experiments
will fail if these variables are not defined though, so for now
they are still generated (but with more-generic values).
"""
extra_software = ""
if self.spec.satisfies("cluster=corona"):
base_indent = " " * 2
package_indent = " " * 4
corona_extras = [
f"blas-rocm:\n{package_indent}{base_indent}pkg_spec: rocblas",
f"lapack-rocm:\n{package_indent}{base_indent}pkg_spec: rocsolver",
f"compiler-amdclang:\n{package_indent}{base_indent}pkg_spec: clang"
]
extra_software = f"\n{package_indent}".join(corona_extras)
return f"""\
software:
packages:
Expand All @@ -116,4 +255,5 @@ def sw_description(self):
pkg_spec: mvapich2
mpi-intel:
pkg_spec: mvapich2
{extra_software}
"""
Loading