Skip to content

Commit

Permalink
Add bisection nccl test
Browse files Browse the repository at this point in the history
Add bisection to makefile

Add bisection doc in performance.md
  • Loading branch information
x41lakazam committed Apr 14, 2024
1 parent c6afef0 commit 52ebfc6
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 1 deletion.
11 changes: 11 additions & 0 deletions doc/PERFORMANCE.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@ Similarly to broadcast, all data need to be sent to the root, hence :
And :

`B = S/t`

### Bisection

In the bisection operation, each rank is paired with a single other rank and sends him one message of size S, hence:

t = S/B

And :

B = S/t

### Summary

Expand All @@ -140,5 +150,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
* AllGather : (_n_-1)/_n_
* Broadcast : 1
* Reduce : 1
* Bisection : 1

The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
2 changes: 1 addition & 1 deletion src/Makefile
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube bisection
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)

build: ${BIN_FILES}
Expand Down
121 changes: 121 additions & 0 deletions src/bisection.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#include "cuda_runtime.h"
#include "common.h"

void BisectionGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = count;
}

int getPeer(int rank, int n_ranks){
if (n_ranks % 4 == 0)
return ((n_ranks / 2 + rank) % n_ranks) + (rank % 2 ? -1 : 1);
// If there is an odd number of ranks, the last rank is ignored and paired with itself
else if (n_ranks % 2 == 1 && rank == n_ranks-1)
return rank;
else
return (rank + n_ranks/2) % (n_ranks - n_ranks % 2);
}

testResult_t BisectionInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus;

for (int i=0; i<args->nGpus; i++) {
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
int peer = getPeer(rank, nranks);
TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
CUDACHECK(cudaDeviceSynchronize());
}
// We don't support in-place sendrecv
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}

void BisectionGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
*busBw = *algBw = (double)(count * typesize) / 1.0E9 / sec;
}

testResult_t BisectionRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int n_ranks, comm_rank, peer;

NCCLCHECK(ncclCommUserRank(comm, &comm_rank));
NCCLCHECK(ncclCommCount(comm, &n_ranks));

peer = getPeer(comm_rank, n_ranks);

NCCLCHECK(ncclGroupStart());
NCCLCHECK(ncclSend(sendbuff, count, type, peer, comm, stream));
NCCLCHECK(ncclRecv(recvbuff, count, type, peer, comm, stream));
NCCLCHECK(ncclGroupEnd());

return testSuccess;
}

struct testColl bisectionTest = {
"Bisection",
BisectionGetCollByteCount,
BisectionInitData,
BisectionGetBw,
BisectionRunColl
};

void BisectionGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
BisectionGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

testResult_t BisectionRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &bisectionTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;

if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}

if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
op_count = test_opnum;
run_ops = test_ops;
run_opnames = test_opnames;
}

for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}

struct testEngine bisectionEngine = {
BisectionGetBuffSize,
BisectionRunTest
};

#pragma weak ncclTestEngine=bisectionEngine

0 comments on commit 52ebfc6

Please sign in to comment.