scikit-build-core build system (#13)

benson31 · bvanessen · web-flow · commit 62431f206f62 · 2025-04-25T19:00:29.000-04:00
* Add scikit-build-core+CMake build system

* Remove debugging comment

* Add missing file

* Add RPATHs to python modules

* Updates to FindNVSHMEM module

* Automatic cuda arch detection

* Add guards to ensure that benchmarks check to see if Torch distributed
is initialized. Also change the path to the nvshmem p2p functions.

* add torch_python to linkage

* Add proper extension names

* Add NVSHMEM's libdir to the r(un)path

---------

Co-authored-by: Brian C. Van Essen <vanessen1@llnl.gov>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,191 @@
+cmake_minimum_required(VERSION 3.25.0)
+
+# Make sure both CMAKE_CUDA_ARCHITECTURES (e.g. "70;80") and
+# TORCH_CUDA_ARCH_LIST (e.g. "7.0;8.0") are set, deriving whichever
+# one is missing from the other. If neither is given, query
+# nvidia-smi for the compute capabilities of the local GPUs and fall
+# back to 70/80/90 when that fails.
+#
+# This is a bit messy, where it comes before project(), but it needs
+# to come before CUDA is enabled (I could also move that out of
+# project() into an explicit enable_language(), but this doesn't
+# require the CXX compiler, so it _can_ go here).
+if (NOT CMAKE_CUDA_ARCHITECTURES AND NOT TORCH_CUDA_ARCH_LIST)
+  execute_process(
+    COMMAND nvidia-smi --query-gpu "compute_cap" --format=csv,noheader
+    RESULT_VARIABLE _nvidia_smi_retcode
+    OUTPUT_VARIABLE _nvidia_smi_output
+    ERROR_QUIET
+  )
+
+  if (_nvidia_smi_retcode EQUAL 0 AND _nvidia_smi_output)
+    # One compute capability per line ("8.0"); turn the output into a
+    # sorted, de-duplicated list and strip the dots for CMake ("80").
+    string(REPLACE "\n" ";" _nvidia_compute_caps "${_nvidia_smi_output}")
+    list(REMOVE_ITEM _nvidia_compute_caps "")
+    list(SORT _nvidia_compute_caps COMPARE NATURAL)
+    list(REMOVE_DUPLICATES _nvidia_compute_caps)
+    string(REPLACE "." "" _nvidia_archs "${_nvidia_compute_caps}")
+
+    # TRB: I don't _think_ these need to go in the cache... But I'm
+    # not opposed to it.
+    set(CMAKE_CUDA_ARCHITECTURES ${_nvidia_archs})
+    set(TORCH_CUDA_ARCH_LIST ${_nvidia_compute_caps})
+  else ()
+    set(CMAKE_CUDA_ARCHITECTURES 70 80 90)
+    set(TORCH_CUDA_ARCH_LIST 7.0 8.0 9.0)
+  endif ()
+elseif (NOT CMAKE_CUDA_ARCHITECTURES)
+  # Torch also accepts a whitespace-separated list and a "+PTX"
+  # suffix (e.g. "7.0 8.0+PTX"); CMake needs a ;-list of bare
+  # numbers. An unsuffixed CMake architecture is built for both the
+  # real and the virtual architecture, so dropping "+PTX" loses
+  # nothing.
+  string(REGEX REPLACE "[ \t]+" ";" _nvidia_archs "${TORCH_CUDA_ARCH_LIST}")
+  string(REPLACE "+PTX" "" _nvidia_archs "${_nvidia_archs}")
+  string(REPLACE "." "" _nvidia_archs "${_nvidia_archs}")
+  set(CMAKE_CUDA_ARCHITECTURES ${_nvidia_archs})
+elseif (NOT TORCH_CUDA_ARCH_LIST)
+  # Put a '.' in front of the last digit of the leading number, so
+  # "80" becomes "8.0" and three-digit values such as "100" become
+  # "10.0" (not "1.00").
+  list(TRANSFORM CMAKE_CUDA_ARCHITECTURES
+    REPLACE "^([0-9]+)([0-9])" "\\1.\\2"
+    OUTPUT_VARIABLE TORCH_CUDA_ARCH_LIST
+  )
+endif ()
+
+message(STATUS "Detected CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
+message(STATUS "Detected TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}")
+
+# Declare the project; this is where the CXX and CUDA languages get
+# enabled.
+project(DGraph
+  VERSION 0.0.1
+  DESCRIPTION "A deep learning library for training graph neural networks at scale"
+  HOMEPAGE_URL "https://github.com/LBANN/DGraph"
+  LANGUAGES CXX CUDA)
+
+# Controls whether NVSHMEM is looked up and the NVSHMEM-backed
+# extension module is built (on by default).
+option(DGRAPH_ENABLE_NVSHMEM "Use NVSHMEM in the build." ON)
+
+# Dependencies
+#
+# Let find_package() see the find modules shipped in cmake/.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+find_package(MPI 3.0 REQUIRED COMPONENTS CXX)
+find_package(Torch 2.6 REQUIRED CONFIG)
+
+# Python has to be located before the torch_python search below,
+# because that search uses Python_SITELIB as a hint; the variable is
+# empty until find_package(Python) has run.
+find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
+find_package(pybind11 CONFIG REQUIRED)
+
+# Also, torch_python!
+# We also don't care about the limited API nonsense, so we can use
+# libtorch. Let's find it: first look next to the Torch library and
+# inside the torch package of the Python site directory, then fall
+# back to CMake's default search and fail if it is still missing.
+if (TORCH_LIBRARY)
+  get_filename_component(TORCH_LIB_DIR "${TORCH_LIBRARY}" DIRECTORY)
+endif ()
+find_library(TORCH_PYTHON_LIBRARY
+  torch_python
+  HINTS
+  ${TORCH_LIB_DIR}
+  ${Python_SITELIB}/torch/lib64
+  ${Python_SITELIB}/torch/lib
+  NO_DEFAULT_PATH)
+find_library(TORCH_PYTHON_LIBRARY torch_python REQUIRED)
+
+if (DGRAPH_ENABLE_NVSHMEM)
+  find_package(NVSHMEM 2.5 REQUIRED MODULE)
+endif ()
+
+# torch_local: Python extension module providing the local masked
+# gather/scatter bindings together with their CUDA kernels.
+python_add_library(torch_local MODULE WITH_SOABI
+  DGraph/distributed/csrc/torch_local_bindings.cpp
+  DGraph/distributed/csrc/torch_local_kernels.cu)
+
+# Headers belonging to the module, rooted at the csrc/ and include/
+# directories.
+target_sources(torch_local
+  PUBLIC FILE_SET HEADERS
+  BASE_DIRS
+    DGraph/distributed/csrc
+    DGraph/distributed/include
+  FILES
+    DGraph/distributed/include/macros.hpp
+    DGraph/distributed/include/torch_local.hpp
+    DGraph/distributed/csrc/local_data_kernels.cuh)
+
+target_link_libraries(torch_local
+  PUBLIC MPI::MPI_CXX torch ${TORCH_PYTHON_LIBRARY}
+  PRIVATE pybind11::headers)
+
+# Strict C++17 / CUDA C++17 without compiler extensions; the install
+# RPATH picks up the directories of the linked libraries.
+set_target_properties(torch_local PROPERTIES
+  CXX_STANDARD 17
+  CXX_STANDARD_REQUIRED ON
+  CXX_EXTENSIONS OFF
+  CUDA_STANDARD 17
+  CUDA_STANDARD_REQUIRED ON
+  CUDA_EXTENSIONS OFF
+  INSTALL_RPATH_USE_LINK_PATH ON)
+
+install(TARGETS torch_local LIBRARY DESTINATION .)
+
+if (DGRAPH_ENABLE_NVSHMEM)
+  # torch_nvshmem_p2p: Python extension module exposing the
+  # NVSHMEMP2P bindings; only built when NVSHMEM support is enabled.
+  python_add_library(torch_nvshmem_p2p MODULE WITH_SOABI
+    DGraph/distributed/csrc/torch_nvshmem_p2p.cu
+    DGraph/distributed/csrc/torch_nvshmem_p2p_bindings.cpp)
+
+  target_sources(torch_nvshmem_p2p
+    PUBLIC FILE_SET HEADERS
+    BASE_DIRS
+      DGraph/distributed/csrc
+      DGraph/distributed/include
+    FILES
+      DGraph/distributed/include/torch_nvshmem_p2p.hpp
+      DGraph/distributed/csrc/local_data_kernels.cuh
+      DGraph/distributed/csrc/nvshmem_comm_kernels.cuh)
+
+  target_link_libraries(torch_nvshmem_p2p
+    PUBLIC NVSHMEM::NVSHMEM MPI::MPI_CXX torch ${TORCH_PYTHON_LIBRARY}
+    PRIVATE pybind11::headers)
+
+  # Strict C++17 / CUDA C++17 without compiler extensions, with
+  # separable compilation turned on for the CUDA sources.
+  set_target_properties(torch_nvshmem_p2p PROPERTIES
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
+    CXX_EXTENSIONS OFF
+    CUDA_STANDARD 17
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_EXTENSIONS OFF
+    CUDA_SEPARABLE_COMPILATION ON
+    INSTALL_RPATH_USE_LINK_PATH ON)
+
+  # Add the directory holding the NVSHMEM library to whatever install
+  # RPATH the target already carries.
+  get_filename_component(_nvshmem_lib_dir "${NVSHMEM_LIBRARY}" DIRECTORY)
+  set_property(TARGET torch_nvshmem_p2p
+    APPEND PROPERTY INSTALL_RPATH "${_nvshmem_lib_dir}")
+
+  install(TARGETS torch_nvshmem_p2p LIBRARY DESTINATION .)
+endif ()
diff --git a/DGraph/distributed/csrc/torch_local_bindings.cpp b/DGraph/distributed/csrc/torch_local_bindings.cpp
@@ -17,8 +17,8 @@
 #include <torch/extension.h>
 #include "torch_local.hpp"
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+PYBIND11_MODULE(torch_local, m)
 {
   m.def("local_masked_gather", &local_masked_gather, "Masked Gather");
   m.def("local_masked_scatter", &local_masked_scatter, "Masked Scatter");
-}
+}
diff --git a/DGraph/distributed/csrc/torch_nvshmem_p2p_bindings.cpp b/DGraph/distributed/csrc/torch_nvshmem_p2p_bindings.cpp
@@ -16,7 +16,7 @@
 #include <torch/extension.h>
 #include "torch_nvshmem_p2p.hpp"
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+PYBIND11_MODULE(torch_nvshmem_p2p, m)
 {
   py::class_<NVSHMEMP2P>(m, "NVSHMEMP2P")
       .def(py::init<>())
diff --git a/DGraph/distributed/nvshmem/NVSHMEMBackendEngine.py b/DGraph/distributed/nvshmem/NVSHMEMBackendEngine.py
@@ -14,7 +14,7 @@
 import torch
 import torch.distributed as dist
 from DGraph.distributed.Engine import BackendEngine
-import torch_nvshmem_p2p as nvshmem
+import DGraph.torch_nvshmem_p2p as nvshmem
 import warnings
 from torch.autograd import Function
 
diff --git a/cmake/FindNVSHMEM.cmake b/cmake/FindNVSHMEM.cmake
@@ -0,0 +1,65 @@
+################################################################################
+## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License.  You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+# Find module for NVSHMEM.
+#
+# Search hints (CMake or environment variables)
+#
+#   NVSHMEM_HOME
+#   NVSHMEM_PREFIX
+#
+# Output variables
+#
+#   NVSHMEM_FOUND
+#   NVSHMEM_LIBRARY
+#   NVSHMEM_INCLUDE_DIRS
+#
+# Also creates an imported target NVSHMEM::NVSHMEM when NVSHMEM is
+# found.
+
+# Find the library: hinted locations first, then the default search.
+find_library(NVSHMEM_LIBRARY nvshmem
+  HINTS ${NVSHMEM_HOME} $ENV{NVSHMEM_HOME}
+  ${NVSHMEM_PREFIX} $ENV{NVSHMEM_PREFIX}
+  PATH_SUFFIXES lib lib64
+  NO_DEFAULT_PATH
+  DOC "The location of NVSHMEM library.")
+find_library(NVSHMEM_LIBRARY nvshmem)
+
+# Find the header: hinted locations first, then the default search
+# for the same header.
+find_path(NVSHMEM_INCLUDE_DIRS nvshmem.h
+  HINTS ${NVSHMEM_HOME} $ENV{NVSHMEM_HOME}
+  ${NVSHMEM_PREFIX} $ENV{NVSHMEM_PREFIX}
+  PATH_SUFFIXES include
+  NO_DEFAULT_PATH
+  DOC "The location of NVSHMEM headers.")
+find_path(NVSHMEM_INCLUDE_DIRS nvshmem.h)
+
+# Handle the find_package arguments
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+  NVSHMEM DEFAULT_MSG NVSHMEM_LIBRARY NVSHMEM_INCLUDE_DIRS)
+
+# Build the imported target. Only do so when the package was actually
+# found (otherwise the target would carry *-NOTFOUND values), and only
+# once, so repeated find_package() calls don't append duplicates.
+if (NVSHMEM_FOUND AND NOT TARGET NVSHMEM::NVSHMEM)
+  add_library(NVSHMEM::NVSHMEM INTERFACE IMPORTED)
+  target_link_libraries(NVSHMEM::NVSHMEM INTERFACE "${NVSHMEM_LIBRARY}")
+  target_include_directories(NVSHMEM::NVSHMEM INTERFACE "${NVSHMEM_INCLUDE_DIRS}")
+  target_compile_definitions(NVSHMEM::NVSHMEM INTERFACE NVSHMEM_TARGET)
+endif ()
diff --git a/experiments/Benchmarks/TestNCCL.py b/experiments/Benchmarks/TestNCCL.py
@@ -180,7 +180,8 @@ def main():
     num_iters = args.num_iters
     log_dir = args.log_dir
 
-    dist.init_process_group(backend="nccl")
+    if not dist.is_initialized():
+        dist.init_process_group(backend="nccl")
     comm = Communicator.init_process_group("nccl")
 
     world_size = dist.get_world_size()
diff --git a/experiments/Benchmarks/TestNVSHMEM.py b/experiments/Benchmarks/TestNVSHMEM.py
@@ -141,12 +141,13 @@ def main():
     rank = comm.get_rank()
     world_size = comm.get_world_size()
 
-    dist.init_process_group(
-        backend="nccl",
-        rank=rank,
-        world_size=world_size,
-        init_method=f"file://{os.getcwd()}/DGraph_tmpfile",
-    )
+    if not dist.is_initialized():
+        dist.init_process_group(
+            backend="nccl",
+            rank=rank,
+            world_size=world_size,
+            init_method=f"file://{os.getcwd()}/DGraph_tmpfile",
+        )
 
     safe_create_dir(log_dir, rank)
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,66 @@
 [build-system]
-requires = ["setuptools", "wheel", "torch"] # Add other build dependencies here
-build-backend = "setuptools.build_meta"
+requires = [
+  "scikit-build-core>=0.10",
+  "pybind11"
+]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "DGraph"
+version = "0.0.1"
+description = "A deep learning library for training graph neural networks at scale"
+authors = [
+  { name = "Shehtab Zaman", email = "zaman1@llnl.gov" },
+  { name = "Tal Ben Nun", email = "bennun2@llnl.gov" },
+  { name = "Tom Benson", email = "benson31@llnl.gov" },
+  { name = "Pier Fiedorowicz", email = "fiedorowicz1@llnl.gov" },
+  { name = "Brian Van Essen", email = "vanessen1@llnl.gov" },
+]
+license = { file = "LICENSE" }
+readme = "README.md"
+requires-python = ">=3.9"
+classifiers = [
+  "Development Status :: 4 - Beta",
+
+  "License :: OSI Approved :: Apache Software License",
+
+  "Programming Language :: C++",
+
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+
+  "Environment :: GPU",
+  "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.2",
+
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: System :: Distributed Computing",
+
+  "Private :: Do Not Upload"
+]
+# dependencies=[
+#   "torch",
+#   "numpy",
+#   "mpi4py>=3.1.4",
+# ]
+
+[project.optional-dependencies]
+test = ["pytest"]
+
+# scikit-build-core configuration for the CMake-driven build.
+[tool.scikit-build]
+# NOTE(review): presumably ties the minimum scikit-build-core version
+# to the bound given in [build-system].requires -- confirm against
+# the scikit-build-core docs.
+minimum-version = "build-system.requires"
+build-dir = "skbuild"
+
+[tool.scikit-build.cmake]
+# Same lower bound as cmake_minimum_required() in CMakeLists.txt.
+version = ">=3.25.0"
+
+[tool.scikit-build.ninja]
+version = ">=1.11"
+make-fallback = false
+
+[tool.scikit-build.wheel]
+expand-macos-universal-tags = true
+# CMakeLists.txt installs the extension modules to ".", and the
+# Python code imports them as DGraph.<module>; this setting is
+# presumably what places them inside the DGraph package -- confirm.
+install-dir = "DGraph"
diff --git a/setup.py b/setup.py

Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@`
`16`	`16`	`#include <torch/extension.h>`
`17`	`17`	`#include "torch_nvshmem_p2p.hpp"`
`18`	`18`
`19`		`-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)`
	`19`	`+PYBIND11_MODULE(torch_nvshmem_p2p, m)`
`20`	`20`	`{`
`21`	`21`	`py::class_<NVSHMEMP2P>(m, "NVSHMEMP2P")`
`22`	`22`	`.def(py::init<>())`