Skip to content

Commit 62431f2

Browse files
benson31bvanessen
andauthored
scikit-build-core build system (#13)
* Add scikit-build-core+CMake build system * Remove debugging comment * Add missing file * Add RPATHs to python modules * Updates to FindNVSHMEM module * Automatic cuda arch detection * Add guards to ensure that benchmarks check to see if Torch distributed is initialized. Also change the path to the nvshmem p2p functions. * add torch_python to linkage * Add proper extension names * Add NVSHMEM's libdir to the r(un)path --------- Co-authored-by: Brian C. Van Essen <[email protected]>
1 parent b8a1ce6 commit 62431f2

File tree

9 files changed

+334
-182
lines changed

9 files changed

+334
-182
lines changed

CMakeLists.txt

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
cmake_minimum_required(VERSION 3.25.0)

# CUDA architecture selection.
#
# This deliberately runs before project(): the architecture list has to
# be settled before the CUDA language is enabled, and nothing in here
# needs the CXX compiler. (The alternative would be to drop CUDA from
# project() and call enable_language(CUDA) explicitly afterwards.)
#
# Two spellings of the same information are kept in sync:
#   CMAKE_CUDA_ARCHITECTURES  CMake form, e.g. "80;90"
#   TORCH_CUDA_ARCH_LIST      Torch form, e.g. "8.0;9.0"
#
# Cases handled:
#   neither set -> query the visible GPUs with nvidia-smi; if that is
#                  not possible fall back to 70/80/90
#   only Torch  -> derive the CMake list from it
#   only CMake  -> derive the Torch list from it
#   both set    -> leave both untouched
if (NOT CMAKE_CUDA_ARCHITECTURES AND NOT TORCH_CUDA_ARCH_LIST)
  execute_process(
    COMMAND nvidia-smi --query-gpu "compute_cap" --format=csv,noheader
    RESULT_VARIABLE _nvidia_smi_retcode
    OUTPUT_VARIABLE _nvidia_smi_output
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )

  set(_nvidia_compute_caps "")
  if (_nvidia_smi_retcode EQUAL 0 AND _nvidia_smi_output)
    # One "X.Y" compute capability per line, one line per GPU. Split on
    # any line ending and keep only well-formed entries so that blank
    # lines or unexpected text never reach the architecture lists.
    string(REGEX REPLACE "[\r\n]+" ";"
      _nvidia_compute_caps "${_nvidia_smi_output}")
    list(TRANSFORM _nvidia_compute_caps STRIP)
    list(FILTER _nvidia_compute_caps INCLUDE REGEX "^[0-9]+\\.[0-9]+$")
    list(SORT _nvidia_compute_caps COMPARE NATURAL)
    list(REMOVE_DUPLICATES _nvidia_compute_caps)
  endif ()

  if (_nvidia_compute_caps)
    string(REPLACE "." "" _nvidia_archs "${_nvidia_compute_caps}")

    # TRB: I don't _think_ these need to go in the cache... But I'm
    # not opposed to it.
    set(CMAKE_CUDA_ARCHITECTURES ${_nvidia_archs})
    set(TORCH_CUDA_ARCH_LIST ${_nvidia_compute_caps})
  else ()
    # No usable answer from nvidia-smi (tool missing, no GPU on the
    # build host, ...): use a fixed default set.
    set(CMAKE_CUDA_ARCHITECTURES 70 80 90)
    set(TORCH_CUDA_ARCH_LIST 7.0 8.0 9.0)
  endif ()
elseif (NOT CMAKE_CUDA_ARCHITECTURES)
  # TORCH_CUDA_ARCH_LIST is commonly written space-separated and may
  # carry a "+PTX" suffix (e.g. "8.0 9.0+PTX"); neither is valid in
  # CMAKE_CUDA_ARCHITECTURES. Split into a proper list and drop the
  # suffix: a plain "XY" entry already makes CMake emit both real and
  # virtual (PTX) code for that architecture.
  string(REGEX REPLACE "[ \t;]+" ";" _nvidia_archs "${TORCH_CUDA_ARCH_LIST}")
  list(REMOVE_ITEM _nvidia_archs "")
  list(TRANSFORM _nvidia_archs REPLACE "\\+PTX$" "")
  list(TRANSFORM _nvidia_archs REPLACE "\\." "")
  list(REMOVE_DUPLICATES _nvidia_archs)
  set(CMAKE_CUDA_ARCHITECTURES ${_nvidia_archs})
elseif (NOT TORCH_CUDA_ARCH_LIST)
  # Insert the '.' in front of the last leading digit so that "80"
  # becomes "8.0" and three-digit values such as "100" become "10.0"
  # (an unanchored two-digit match would turn "100" into "1.00").
  list(TRANSFORM CMAKE_CUDA_ARCHITECTURES
    REPLACE "^([0-9]+)([0-9])" "\\1.\\2"
    OUTPUT_VARIABLE TORCH_CUDA_ARCH_LIST
  )
endif ()

message(STATUS "Detected CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS "Detected TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}")
43+
44+
# Top-level project declaration. CUDA is enabled here, so anything that
# influences CUDA language setup has to be configured before this call.
project(DGraph
  VERSION 0.0.1
  DESCRIPTION "A deep learning library for training graph neural networks at scale"
  HOMEPAGE_URL "https://github.com/LBANN/DGraph"
  LANGUAGES CXX CUDA)

# Build switch for the NVSHMEM-backed extension module; enabled by
# default.
option(DGRAPH_ENABLE_NVSHMEM "Use NVSHMEM in the build." ON)
56+
57+
# Dependencies
#
# Project-local find modules (e.g. FindNVSHMEM.cmake) live in ./cmake.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

find_package(MPI 3.0 REQUIRED COMPONENTS CXX)
find_package(Torch 2.6 REQUIRED CONFIG)

# Python must be located before the torch_python search below: the
# Python_SITEARCH / Python_SITELIB variables used as hints there are
# only defined once FindPython has run.
find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
find_package(pybind11 CONFIG REQUIRED)

# Also, torch_python!
# The extension modules are not restricted to the Python limited API,
# so they link against libtorch_python directly. Look next to the
# Torch library first, then inside the torch package of the active
# Python installation, and only then fall back to the default search
# paths.
if (TORCH_LIBRARY)
  get_filename_component(TORCH_LIB_DIR "${TORCH_LIBRARY}" DIRECTORY)
endif ()

set(_torch_python_hints ${TORCH_LIB_DIR})
foreach (_site_dir IN ITEMS "${Python_SITEARCH}" "${Python_SITELIB}")
  if (_site_dir)
    list(APPEND _torch_python_hints
      "${_site_dir}/torch/lib64"
      "${_site_dir}/torch/lib"
    )
  endif ()
endforeach ()
unset(_site_dir)

find_library(TORCH_PYTHON_LIBRARY
  NAMES torch_python
  HINTS ${_torch_python_hints}
  NO_DEFAULT_PATH)
# No-op if the hinted search succeeded (the result is cached);
# otherwise search the default locations and fail if still missing.
find_library(TORCH_PYTHON_LIBRARY torch_python REQUIRED)

if (DGRAPH_ENABLE_NVSHMEM)
  find_package(NVSHMEM 2.5 REQUIRED MODULE)
endif ()
87+
88+
# torch_local: Python extension module built from the torch_local
# pybind11 bindings and their CUDA kernels.
python_add_library(torch_local MODULE WITH_SOABI
  DGraph/distributed/csrc/torch_local_bindings.cpp
  DGraph/distributed/csrc/torch_local_kernels.cu
)

# Headers that belong to the module, registered as a header file set.
target_sources(torch_local
  PUBLIC
  FILE_SET HEADERS
  BASE_DIRS
    DGraph/distributed/csrc
    DGraph/distributed/include
  FILES
    DGraph/distributed/include/macros.hpp
    DGraph/distributed/include/torch_local.hpp
    DGraph/distributed/csrc/local_data_kernels.cuh
)

# Link dependencies; pybind11 is only needed to compile the bindings.
target_link_libraries(torch_local
  PUBLIC
    MPI::MPI_CXX
    torch
    ${TORCH_PYTHON_LIBRARY}
)
target_link_libraries(torch_local
  PRIVATE
    pybind11::headers
)

# Strict C++17 for both host and device code, without compiler
# extensions.
foreach (_dgraph_lang IN ITEMS CXX CUDA)
  set_target_properties(torch_local
    PROPERTIES
      ${_dgraph_lang}_STANDARD 17
      ${_dgraph_lang}_STANDARD_REQUIRED ON
      ${_dgraph_lang}_EXTENSIONS OFF
  )
endforeach ()
unset(_dgraph_lang)

# Carry the directories of linked libraries into the installed module's
# run-time search path.
set_target_properties(torch_local
  PROPERTIES
    INSTALL_RPATH_USE_LINK_PATH ON
)

# Install into the root of the install prefix.
install(TARGETS torch_local LIBRARY DESTINATION .)
130+
131+
# torch_nvshmem_p2p: Python extension module for the NVSHMEM backend.
# Only built when DGRAPH_ENABLE_NVSHMEM is on.
if (DGRAPH_ENABLE_NVSHMEM)
  python_add_library(torch_nvshmem_p2p MODULE WITH_SOABI
    DGraph/distributed/csrc/torch_nvshmem_p2p.cu
    DGraph/distributed/csrc/torch_nvshmem_p2p_bindings.cpp
  )

  # Headers that belong to the module, registered as a header file set.
  target_sources(torch_nvshmem_p2p
    PUBLIC
    FILE_SET HEADERS
    BASE_DIRS
      DGraph/distributed/csrc
      DGraph/distributed/include
    FILES
      DGraph/distributed/include/torch_nvshmem_p2p.hpp
      DGraph/distributed/csrc/local_data_kernels.cuh
      DGraph/distributed/csrc/nvshmem_comm_kernels.cuh
  )

  # Link dependencies; pybind11 is only needed to compile the bindings.
  target_link_libraries(torch_nvshmem_p2p
    PUBLIC
      NVSHMEM::NVSHMEM
      MPI::MPI_CXX
      torch
      ${TORCH_PYTHON_LIBRARY}
  )
  target_link_libraries(torch_nvshmem_p2p
    PRIVATE
      pybind11::headers
  )

  # Strict C++17 for both host and device code, without compiler
  # extensions.
  foreach (_dgraph_lang IN ITEMS CXX CUDA)
    set_target_properties(torch_nvshmem_p2p
      PROPERTIES
        ${_dgraph_lang}_STANDARD 17
        ${_dgraph_lang}_STANDARD_REQUIRED ON
        ${_dgraph_lang}_EXTENSIONS OFF
    )
  endforeach ()
  unset(_dgraph_lang)

  set_target_properties(torch_nvshmem_p2p
    PROPERTIES
      CUDA_SEPARABLE_COMPILATION ON
      INSTALL_RPATH_USE_LINK_PATH ON
  )

  # Add the directory holding the NVSHMEM library to the installed
  # module's rpath, keeping whatever INSTALL_RPATH the target already
  # had (APPEND extends an existing value and sets it when there is
  # none).
  get_filename_component(_nvshmem_lib_dir "${NVSHMEM_LIBRARY}" DIRECTORY)
  set_property(TARGET torch_nvshmem_p2p
    APPEND PROPERTY INSTALL_RPATH "${_nvshmem_lib_dir}"
  )

  # Install into the root of the install prefix.
  install(TARGETS torch_nvshmem_p2p LIBRARY DESTINATION .)
endif ()

DGraph/distributed/csrc/torch_local_bindings.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
#include <torch/extension.h>
1818
#include "torch_local.hpp"
1919

20-
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
20+
PYBIND11_MODULE(torch_local, m)
2121
{
2222
m.def("local_masked_gather", &local_masked_gather, "Masked Gather");
2323
m.def("local_masked_scatter", &local_masked_scatter, "Masked Scatter");
24-
}
24+
}

DGraph/distributed/csrc/torch_nvshmem_p2p_bindings.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include <torch/extension.h>
1717
#include "torch_nvshmem_p2p.hpp"
1818

19-
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
19+
PYBIND11_MODULE(torch_nvshmem_p2p, m)
2020
{
2121
py::class_<NVSHMEMP2P>(m, "NVSHMEMP2P")
2222
.def(py::init<>())

DGraph/distributed/nvshmem/NVSHMEMBackendEngine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import torch
1515
import torch.distributed as dist
1616
from DGraph.distributed.Engine import BackendEngine
17-
import torch_nvshmem_p2p as nvshmem
17+
import DGraph.torch_nvshmem_p2p as nvshmem
1818
import warnings
1919
from torch.autograd import Function
2020

cmake/FindNVSHMEM.cmake

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
################################################################################
2+
## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC.
3+
## Produced at the Lawrence Livermore National Laboratory.
4+
## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5+
## the CONTRIBUTORS file. <[email protected]>
6+
##
7+
## LLNL-CODE-697807.
8+
## All rights reserved.
9+
##
10+
## This file is part of LBANN: Livermore Big Artificial Neural Network
11+
## Toolkit. For details, see http://software.llnl.gov/LBANN or
12+
## https://github.com/LLNL/LBANN.
13+
##
14+
## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
15+
## may not use this file except in compliance with the License. You may
16+
## obtain a copy of the License at:
17+
##
18+
## http://www.apache.org/licenses/LICENSE-2.0
19+
##
20+
## Unless required by applicable law or agreed to in writing, software
21+
## distributed under the License is distributed on an "AS IS" BASIS,
22+
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23+
## implied. See the License for the specific language governing
24+
## permissions and limitations under the license.
25+
################################################################################
26+
27+
# Find module for NVSHMEM.
#
# Search hints (CMake or environment variables):
#
#   NVSHMEM_HOME, NVSHMEM_PREFIX
#
# Output variables
#
#   NVSHMEM_FOUND
#   NVSHMEM_LIBRARY
#   NVSHMEM_INCLUDE_DIRS
#
# Also creates an imported target NVSHMEM::NVSHMEM when the package is
# found.
#
# NOTE(review): this module does not detect the installed NVSHMEM
# version, so a version passed to find_package(NVSHMEM <version>) is
# presumably not enforced -- confirm before relying on it.

# Find the library: hinted locations first, then the default search
# paths (the second call is a no-op once the cache entry is filled).
find_library(NVSHMEM_LIBRARY nvshmem
  HINTS ${NVSHMEM_HOME} $ENV{NVSHMEM_HOME}
  ${NVSHMEM_PREFIX} $ENV{NVSHMEM_PREFIX}
  PATH_SUFFIXES lib lib64
  NO_DEFAULT_PATH
  DOC "The location of NVSHMEM library.")
find_library(NVSHMEM_LIBRARY nvshmem)

# Find the header, with the same two-stage search. Both stages look for
# the same file so the result does not depend on which stage matched.
find_path(NVSHMEM_INCLUDE_DIRS nvshmem.h
  HINTS ${NVSHMEM_HOME} $ENV{NVSHMEM_HOME}
  ${NVSHMEM_PREFIX} $ENV{NVSHMEM_PREFIX}
  PATH_SUFFIXES include
  NO_DEFAULT_PATH
  DOC "The location of NVSHMEM headers.")
find_path(NVSHMEM_INCLUDE_DIRS nvshmem.h)

# Handle the find_package arguments
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NVSHMEM
  REQUIRED_VARS NVSHMEM_LIBRARY NVSHMEM_INCLUDE_DIRS)

mark_as_advanced(NVSHMEM_LIBRARY NVSHMEM_INCLUDE_DIRS)

# Build the imported target. This is done only when the package was
# actually found (otherwise the target would carry *-NOTFOUND values)
# and only once, so repeated find_package() calls do not pile
# duplicate entries onto the target.
if (NVSHMEM_FOUND AND NOT TARGET NVSHMEM::NVSHMEM)
  add_library(NVSHMEM::NVSHMEM INTERFACE IMPORTED)

  target_link_libraries(NVSHMEM::NVSHMEM INTERFACE "${NVSHMEM_LIBRARY}")
  target_include_directories(NVSHMEM::NVSHMEM INTERFACE "${NVSHMEM_INCLUDE_DIRS}")
  target_compile_definitions(NVSHMEM::NVSHMEM INTERFACE NVSHMEM_TARGET)
endif ()

experiments/Benchmarks/TestNCCL.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ def main():
180180
num_iters = args.num_iters
181181
log_dir = args.log_dir
182182

183-
dist.init_process_group(backend="nccl")
183+
if not dist.is_initialized():
184+
dist.init_process_group(backend="nccl")
184185
comm = Communicator.init_process_group("nccl")
185186

186187
world_size = dist.get_world_size()

experiments/Benchmarks/TestNVSHMEM.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,13 @@ def main():
141141
rank = comm.get_rank()
142142
world_size = comm.get_world_size()
143143

144-
dist.init_process_group(
145-
backend="nccl",
146-
rank=rank,
147-
world_size=world_size,
148-
init_method=f"file://{os.getcwd()}/DGraph_tmpfile",
149-
)
144+
if not dist.is_initialized():
145+
dist.init_process_group(
146+
backend="nccl",
147+
rank=rank,
148+
world_size=world_size,
149+
init_method=f"file://{os.getcwd()}/DGraph_tmpfile",
150+
)
150151

151152
safe_create_dir(log_dir, rank)
152153

pyproject.toml

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,66 @@
11
[build-system]
2-
requires = ["setuptools", "wheel", "torch"] # Add other build dependencies here
3-
build-backend = "setuptools.build_meta"
2+
requires = [
3+
"scikit-build-core>=0.10",
4+
"pybind11"
5+
]
6+
build-backend = "scikit_build_core.build"
7+
8+
[project]
9+
name = "DGraph"
10+
version = "0.0.1"
11+
description = "A deep learning library for training graph neural networks at scale"
12+
authors = [
13+
{ name = "Shehtab Zaman", email = "[email protected]" },
14+
{ name = "Tal Ben Nun", email = "[email protected]" },
15+
{ name = "Tom Benson", email = "[email protected]" },
16+
{ name = "Pier Fiedorowicz", email = "[email protected]" },
17+
{ name = "Brian Van Essen", email = "[email protected]" },
18+
]
19+
license = { file = "LICENSE" }
20+
readme = "README.md"
21+
requires-python = ">=3.9"
22+
classifiers = [
23+
"Development Status :: 4 - Beta",
24+
25+
"License :: OSI Approved :: Apache Software License",
26+
27+
"Programming Language :: C++",
28+
29+
"Programming Language :: Python :: 3",
30+
"Programming Language :: Python :: 3.9",
31+
"Programming Language :: Python :: 3.10",
32+
"Programming Language :: Python :: 3.11",
33+
"Programming Language :: Python :: 3.12",
34+
"Programming Language :: Python :: 3.13",
35+
36+
"Environment :: GPU",
37+
"Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.2",
38+
39+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
40+
"Topic :: System :: Distributed Computing",
41+
42+
"Private :: Do Not Upload"
43+
]
44+
# dependencies=[
45+
# "torch",
46+
# "numpy",
47+
# "mpi4py>=3.1.4",
48+
# ]
49+
50+
[project.optional-dependencies]
51+
test = ["pytest"]
52+
53+
[tool.scikit-build]
54+
minimum-version = "build-system.requires"
55+
build-dir = "skbuild"
56+
57+
[tool.scikit-build.cmake]
58+
version = ">=3.25.0"
59+
60+
[tool.scikit-build.ninja]
61+
version = ">=1.11"
62+
make-fallback = false
63+
64+
[tool.scikit-build.wheel]
65+
expand-macos-universal-tags = true
66+
install-dir = "DGraph"

0 commit comments

Comments
 (0)