
Commit b8ffecc

0.4.0 release (#27)
* Run clang-format.
* Remove extraneous TODO.
* Update binding utility scripts.
* Bump version defines.
* Update version info in docs.
* Resolve compilation warnings in autotune.cc with conditional NCCL paths.
* Update integer-to-logical command-line argument handling in Fortran tests.
* Add CMake build recommendation to README.md.
1 parent 6e17527 commit b8ffecc


16 files changed: +127 additions, -111 deletions


CMakeLists.txt

Lines changed: 0 additions & 2 deletions
```diff
@@ -72,8 +72,6 @@ if (CRAY_CC_BIN)
   message(STATUS "Found GDRCopy library: " ${GDRCOPY_LIBRARY})
 endif()
 
-# TODO: Check for MPICH to define `-DMPICH` flag
-
 # HPC SDK
 find_package(NVHPC REQUIRED COMPONENTS CUDA MATH)
 
```

README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -14,7 +14,7 @@ Please contact us or open a GitHub issue if you are interested in using this lib
 
 ## Build
 
-### Method 1: Makefile with Configuration file
+### Method 1: Makefile with Configuration file (deprecated)
 To build the library, you must first create a configuration file to point the installed to dependent library paths and enable/disable features.
 See the default [`nvhpcsdk.conf`](configs/nvhpcsdk.conf) for an example of settings to build the library using the [NVHPC SDK compilers and libraries](https://developer.nvidia.com/hpc-sdk).
 The [`configs/`](configs) directory also contains several sample build configuration files for a number of GPU compute clusters, like Perlmutter, Summit, and Marconi 100.
@@ -25,9 +25,9 @@ With this configuration file created, you can build the library using the comman
 $ make -j CONFIGFILE=<path to your configuration file>
 ```
 
-The library will be compiled and installed in a newly created `build/` directory.
+The library will be compiled and installed in a newly created `build/` directory. This build method is deprecated and will be removed in a future release.
 
-### Method 2: CMake
+### Method 2: CMake (recommended)
 We also enable builds using CMake. A CMake build of the library without additional examples/tests can be completed using the following commands
 ```shell
 $ mkdir build
````

benchmark/benchmark.cu

Lines changed: 1 addition & 3 deletions
```diff
@@ -220,9 +220,7 @@ int main(int argc, char** argv) {
   cudecompGridDescAutotuneOptions_t options;
   CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(&options));
   options.dtype = get_cudecomp_datatype(complex_t(0));
-  for (int i = 0; i < 4; ++i) {
-    options.transpose_use_inplace_buffers[i] = !out_of_place;
-  }
+  for (int i = 0; i < 4; ++i) { options.transpose_use_inplace_buffers[i] = !out_of_place; }
 
   if (comm_backend != 0) {
     config.transpose_comm_backend = comm_backend;
```

docs/conf.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -24,7 +24,8 @@
 author = 'NVIDIA Corporation'
 
 # The full version, including alpha/beta/rc tags
-release = '2022'
+version = '0.4.0'
+release = version
 
 
 # -- General configuration ---------------------------------------------------
```

include/cudecomp.h

Lines changed: 12 additions & 12 deletions
```diff
@@ -43,8 +43,8 @@
 #include <mpi.h>
 
 #define CUDECOMP_MAJOR 0
-#define CUDECOMP_MINOR 3
-#define CUDECOMP_PATCH 1
+#define CUDECOMP_MINOR 4
+#define CUDECOMP_PATCH 0
 
 /**
  * @brief This enum lists the different available transpose backend options.
@@ -140,10 +140,10 @@ typedef struct {
  */
 typedef struct {
   // General options
-  int32_t n_warmup_trials; ///< number of warmup trials to run for each tested configuration during autotuning
-                           ///< (default: 3)
-  int32_t n_trials;        ///< number of timed trials to run for each tested configuration during autotuning
-                           ///< (default: 5)
+  int32_t n_warmup_trials;              ///< number of warmup trials to run for each tested configuration during autotuning
+                                        ///< (default: 3)
+  int32_t n_trials;                     ///< number of timed trials to run for each tested configuration during autotuning
+                                        ///< (default: 5)
   cudecompAutotuneGridMode_t grid_mode; ///< which communication (transpose/halo) to use to autotune process grid
                                         ///< (default: CUDECOMP_AUTOTUNE_GRID_TRANSPOSE)
   cudecompDataType_t dtype; ///< datatype to use during autotuning (default: CUDECOMP_DOUBLE)
@@ -152,19 +152,19 @@ typedef struct {
   bool disable_nccl_backends; ///< flag to disable NCCL backend options during autotuning (default: false)
   bool disable_nvshmem_backends; ///< flag to disable NVSHMEM backend options during autotuning (default: false)
   double skip_threshold; ///< threshold used to skip testing slow configurations; skip configuration
-                         ///< if `skip_threshold * t > t_best`, where `t` is the duration of the first timed trial
-                         ///< for the configuration and `t_best` is the average trial time of the current best
-                         ///< configuration (default: 0.0)
+                                        ///< if `skip_threshold * t > t_best`, where `t` is the duration of the first timed trial
+                                        ///< for the configuration and `t_best` is the average trial time of the current best
+                                        ///< configuration (default: 0.0)
 
   // Transpose-specific options
   bool autotune_transpose_backend; ///< flag to enable transpose backend autotuning (default: false)
   bool transpose_use_inplace_buffers[4]; ///< flag to control whether transpose autotuning uses in-place or out-of-place
                                          ///< buffers during autotuning by transpose operation, considering
                                          ///< the following order: X-to-Y, Y-to-Z, Z-to-Y, Y-to-X
                                          ///< (default: [false, false, false, false])
-  double transpose_op_weights[4]; ///< multiplicative weight to apply to trial time contribution by transpose operation
-                                  ///< in the following order: X-to-Y, Y-to-Z, Z-to-Y, Y-to-X
-                                  ///< (default: [1.0, 1.0, 1.0, 1.0])
+  double transpose_op_weights[4];        ///< multiplicative weight to apply to trial time contribution by transpose operation
+                                         ///< in the following order: X-to-Y, Y-to-Z, Z-to-Y, Y-to-X
+                                         ///< (default: [1.0, 1.0, 1.0, 1.0])
 
   // Halo-specific options
   bool autotune_halo_backend; ///< flag to enable halo backend autotuning (default: false)
```
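The fields reflowed above are the knobs exercised by the autotuning path in `benchmark.cu`. As a rough illustration (not library code: only the type, the `cudecompGridDescAutotuneOptionsSetDefaults` call, the field names, and the default enum values come from this diff; the helper function and the chosen values are hypothetical), populating the struct might look like:

```cpp
// Illustrative sketch of filling cudecompGridDescAutotuneOptions_t; error handling omitted.
#include <cudecomp.h>

void setup_autotune_options(cudecompGridDescAutotuneOptions_t& options, bool out_of_place) {
  // Start from the library defaults, as benchmark.cu does.
  cudecompGridDescAutotuneOptionsSetDefaults(&options);

  // General options: more trials give more stable timings at higher autotune cost.
  options.n_warmup_trials = 3;
  options.n_trials = 5;
  options.grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE;
  options.dtype = CUDECOMP_DOUBLE;

  // Skip a candidate configuration early if its first timed trial is already slow:
  // with skip_threshold = 0.5, a configuration is dropped once its first trial takes
  // more than twice the current best average (0.5 * t > t_best).
  options.skip_threshold = 0.5;

  // Transpose-specific options: weight all four operations equally and pick in-place
  // vs. out-of-place buffers per operation (order: X-to-Y, Y-to-Z, Z-to-Y, Y-to-X).
  options.autotune_transpose_backend = true;
  for (int i = 0; i < 4; ++i) {
    options.transpose_use_inplace_buffers[i] = !out_of_place;
    options.transpose_op_weights[i] = 1.0;
  }
}
```

The populated struct would then be passed to grid descriptor creation, which is outside the scope of this diff.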

include/internal/comm_routines.h

Lines changed: 4 additions & 5 deletions
```diff
@@ -86,11 +86,10 @@ nvshmemAlltoallV(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_
       size_t send_bytes = send_counts[dst_rank] * sizeof(T);
       size_t nchunks = (send_bytes + CUDECOMP_NVSHMEM_CHUNK_SZ - 1) / CUDECOMP_NVSHMEM_CHUNK_SZ;
       for (size_t j = 0; j < nchunks; ++j) {
-        nvshmemx_putmem_nbi_on_stream(
-            recv_buff + recv_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
-            send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
-            std::min(CUDECOMP_NVSHMEM_CHUNK_SZ, send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
-            dst_rank_global, stream);
+        nvshmemx_putmem_nbi_on_stream(recv_buff + recv_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
+                                      send_buff + send_offsets[dst_rank] + j * (CUDECOMP_NVSHMEM_CHUNK_SZ / sizeof(T)),
+                                      std::min(CUDECOMP_NVSHMEM_CHUNK_SZ, send_bytes - j * CUDECOMP_NVSHMEM_CHUNK_SZ),
+                                      dst_rank_global, stream);
       }
       continue;
     }
```
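The reformatted call above issues one non-blocking put per chunk: each peer's message is split into `CUDECOMP_NVSHMEM_CHUNK_SZ`-byte pieces, with the final piece carrying the remainder. A self-contained sketch of just that chunking arithmetic (the constant and helper below are illustrative stand-ins, not cuDecomp code):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Stand-in for CUDECOMP_NVSHMEM_CHUNK_SZ (the real value is defined in the library).
constexpr std::size_t kChunkSize = 1 << 20;

// Returns the byte count of each chunk for a message of send_bytes bytes.
std::vector<std::size_t> chunk_sizes(std::size_t send_bytes) {
  std::size_t nchunks = (send_bytes + kChunkSize - 1) / kChunkSize; // ceiling division
  std::vector<std::size_t> sizes(nchunks);
  for (std::size_t j = 0; j < nchunks; ++j) {
    // Every chunk is full-sized except possibly the last one.
    sizes[j] = std::min(kChunkSize, send_bytes - j * kChunkSize);
  }
  return sizes;
}
```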

include/internal/common.h

Lines changed: 3 additions & 2 deletions
```diff
@@ -61,13 +61,14 @@ struct cudecompHandle {
   ncclComm_t nccl_comm = nullptr;       // NCCL communicator (global)
   ncclComm_t nccl_local_comm = nullptr; // NCCL communicator (intranode)
   bool nccl_enable_ubr = false;         // Flag to control NCCL user buffer registration usage
-  std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>> nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s)
+  std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>>
+      nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s)
 
   cudaStream_t pl_stream = nullptr; // stream used for pipelined backends
 
   cutensorHandle_t cutensor_handle; // cuTENSOR handle;
 #if CUTENSOR_MAJOR >= 2
-  cutensorPlanPreference_t cutensor_plan_pref; // cuTENSOR plan preference;
+  cutensorPlanPreference_t cutensor_plan_pref;  // cuTENSOR plan preference;
 #endif
 
   std::vector<std::array<char, MPI_MAX_PROCESSOR_NAME>> hostnames; // list of hostnames by rank
```
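`nccl_ubr_handles`, split across two lines above, maps an allocated buffer address to the NCCL registration handle(s) created for it on one or more communicators. A hedged sketch of how such a map can be maintained, assuming NCCL's user-buffer-registration calls `ncclCommRegister`/`ncclCommDeregister` from recent NCCL releases (the helper functions here are illustrative, not cuDecomp API):

```cpp
#include <nccl.h>
#include <cstddef>
#include <unordered_map>
#include <utility>
#include <vector>

// Same shape as the nccl_ubr_handles member in the diff above.
using UbrHandleMap = std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>>;

// Register a buffer with both the global and intranode communicators,
// remembering each (communicator, handle) pair under the buffer address.
void register_buffer(UbrHandleMap& handles, ncclComm_t global_comm, ncclComm_t local_comm,
                     void* buff, std::size_t size) {
  for (ncclComm_t comm : {global_comm, local_comm}) {
    void* handle = nullptr;
    if (ncclCommRegister(comm, buff, size, &handle) == ncclSuccess) {
      handles[buff].emplace_back(comm, handle);
    }
  }
}

// Deregister every handle recorded for a buffer before it is freed.
void deregister_buffer(UbrHandleMap& handles, void* buff) {
  auto it = handles.find(buff);
  if (it == handles.end()) return;
  for (auto& [comm, handle] : it->second) { ncclCommDeregister(comm, handle); }
  handles.erase(it);
}
```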

include/internal/transpose.h

Lines changed: 12 additions & 14 deletions
```diff
@@ -77,18 +77,15 @@ template <typename T> static inline cutensorDataType_t getCutensorDataType() { r
 
 static inline cutensorComputeDescriptor_t getCutensorComputeType(cutensorDataType_t cutensor_dtype) {
   switch (cutensor_dtype) {
-  case CUTENSOR_R_32F:
-  case CUTENSOR_C_32F:
-    return CUTENSOR_COMPUTE_DESC_32F;
-  case CUTENSOR_R_64F:
-  case CUTENSOR_C_64F:
-  default:
-    return CUTENSOR_COMPUTE_DESC_64F;
+  case CUTENSOR_R_32F:
+  case CUTENSOR_C_32F: return CUTENSOR_COMPUTE_DESC_32F;
+  case CUTENSOR_R_64F:
+  case CUTENSOR_C_64F:
+  default: return CUTENSOR_COMPUTE_DESC_64F;
   }
 }
 
-template <typename T>
-static inline uint32_t getAlignment(const T* ptr) {
+template <typename T> static inline uint32_t getAlignment(const T* ptr) {
   auto i_ptr = reinterpret_cast<std::uintptr_t>(ptr);
   for (uint32_t d = 16; d > 0; d >>= 1) {
     if (i_ptr % d == 0) return d;
@@ -116,14 +113,15 @@ static void localPermute(const cudecompHandle_t handle, const std::array<int64_t
   CHECK_CUTENSOR(cutensorCreateTensorDescriptor(handle->cutensor_handle, &desc_in, 3, extent_in.data(), strides_in_ptr,
                                                 cutensor_type, getAlignment(input)));
   cutensorTensorDescriptor_t desc_out;
-  CHECK_CUTENSOR(cutensorCreateTensorDescriptor(handle->cutensor_handle, &desc_out, 3, extent_out.data(), strides_out_ptr,
-                                                cutensor_type, getAlignment(output)));
+  CHECK_CUTENSOR(cutensorCreateTensorDescriptor(handle->cutensor_handle, &desc_out, 3, extent_out.data(),
+                                                strides_out_ptr, cutensor_type, getAlignment(output)));
 
   cutensorOperationDescriptor_t desc_op;
-  CHECK_CUTENSOR(cutensorCreatePermutation(handle->cutensor_handle, &desc_op, desc_in, order_in.data(), CUTENSOR_OP_IDENTITY,
-                                           desc_out, order_out.data(), getCutensorComputeType(cutensor_type)));
+  CHECK_CUTENSOR(cutensorCreatePermutation(handle->cutensor_handle, &desc_op, desc_in, order_in.data(),
+                                           CUTENSOR_OP_IDENTITY, desc_out, order_out.data(),
+                                           getCutensorComputeType(cutensor_type)));
 
-  cutensorPlan_t plan;
+  cutensorPlan_t plan;
   CHECK_CUTENSOR(cutensorCreatePlan(handle->cutensor_handle, &plan, desc_op, handle->cutensor_plan_pref, 0));
 
   T one(1);
```
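`getAlignment`, now collapsed onto a single template line, probes the largest power-of-two divisor (capped at 16) of a pointer's address before it is handed to the cuTENSOR descriptor calls. A minimal standalone demonstration of that logic (the `main` below is illustrative only):

```cpp
#include <cstdint>
#include <cstdio>

// Same probe as in transpose.h: return the largest d in {16, 8, 4, 2, 1}
// that evenly divides the pointer address.
template <typename T> static inline uint32_t getAlignment(const T* ptr) {
  auto i_ptr = reinterpret_cast<std::uintptr_t>(ptr);
  for (uint32_t d = 16; d > 0; d >>= 1) {
    if (i_ptr % d == 0) return d;
  }
  return 1; // unreachable (d == 1 always divides); keeps compilers quiet
}

int main() {
  alignas(16) double buf[8];
  std::printf("alignment of buf     = %u\n", getAlignment(buf));     // 16
  std::printf("alignment of buf + 1 = %u\n", getAlignment(buf + 1)); // 8 (double-aligned only)
  return 0;
}
```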
