Support setting CUDA_VISIBLE_DEVICES env variable
Greg Inozemtsev authored and ryanamazon committed Aug 19, 2022
1 parent 8274cb4 commit 1c00538
Showing 11 changed files with 43 additions and 16 deletions.
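
The change is mechanical: each test computed a device index from the local rank, thread, and per-thread GPU count and passed it straight to cudaSetDevice(). When CUDA_VISIBLE_DEVICES restricts the process to a subset of the node's GPUs, that index can exceed the visible device count and the call fails. The commit queries cudaGetDeviceCount() and wraps every computed index with a modulo so it always lands in the visible range. A minimal standalone sketch of the pattern, not taken from the tests (the values are hypothetical; CUDACHECK in the real code wraps each call the same way):

// Hypothetical scenario: the launcher exported CUDA_VISIBLE_DEVICES=3 for
// this process, so one device is visible and its only valid ordinal is 0.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int nGpusVisible = 0;
  if (cudaGetDeviceCount(&nGpusVisible) != cudaSuccess || nGpusVisible == 0) {
    fprintf(stderr, "no visible CUDA devices\n");
    return 1;
  }
  // Computed from the local rank as in the tests; with localRank=3 and one
  // GPU per thread this would be 3.
  int gpuid = 3;
  // Without the modulo, cudaSetDevice(3) fails when only 1 device is visible.
  if (cudaSetDevice(gpuid % nGpusVisible) != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed\n");
    return 1;
  }
  printf("visible=%d selected=%d\n", nGpusVisible, gpuid % nGpusVisible);
  return 0;
}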
4 changes: 3 additions & 1 deletion src/all_gather.cu
@@ -31,10 +31,12 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
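
The same two added lines and the same modulo recur in each collective's InitData below; only the buffer setup around them differs. A hedged illustration of why the wrap-around helps in the common one-process-per-GPU launch, with hypothetical values:

#include <cstdio>

// Each process is masked down to its own GPU by the launcher, so every
// rank's globally computed gpuid must collapse to ordinal 0.
int main() {
  int nThreads = 1, nGpus = 1;
  int nGpusVisible = 1;                    // one device visible per process
  for (int localRank = 0; localRank < 4; localRank++) {
    int gpuid = localRank*nThreads*nGpus;  // 0, 1, 2, 3 across processes
    printf("localRank %d -> cudaSetDevice(%d)\n", localRank, gpuid % nGpusVisible);
  }
  return 0;
}

Every rank selects device 0, which is exactly right when each process sees only its own GPU.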
4 changes: 3 additions & 1 deletion src/all_reduce.cu
@@ -31,10 +31,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
4 changes: 3 additions & 1 deletion src/alltoall.cu
@@ -31,11 +31,13 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     char* str = getenv("NCCL_TESTS_DEVICE");
     int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
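
alltoall.cu additionally honors an NCCL_TESTS_DEVICE override, visible in the context lines above, and the override now goes through the same modulo. A sketch of that selection logic in isolation (the helper name is ours, not the repository's):

#include <cstdlib>

// Device selection as in AlltoAllInitData: an explicit NCCL_TESTS_DEVICE
// value takes precedence over the computed index, and either path is
// wrapped into the visible range.
int pickDevice(int computedGpuid, int nGpusVisible) {
  const char* str = getenv("NCCL_TESTS_DEVICE");
  int gpuid = str ? atoi(str) : computedGpuid;
  return gpuid % nGpusVisible;
}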
4 changes: 3 additions & 1 deletion src/broadcast.cu
@@ -30,10 +30,12 @@ void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par
 testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
19 changes: 13 additions & 6 deletions src/common.cu
@@ -52,6 +52,7 @@ thread_local int is_main_thread = 0;
 // Command line parameter defaults
 static int nThreads = 1;
 static int nGpus = 1;
+static int nGpusVisible;
 static size_t minBytes = 32*1024*1024;
 static size_t maxBytes = 32*1024*1024;
 static size_t stepBytes = 1*1024*1024;
@@ -432,7 +433,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int device;
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
-    CUDACHECK(cudaSetDevice(device));
+    CUDACHECK(cudaSetDevice(device % nGpusVisible));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
     TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost));
     maxDelta = std::max(*(args->deltaHost), maxDelta);
@@ -788,7 +789,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
   int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  CUDACHECK(cudaSetDevice(gpuid));
+  CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -805,7 +806,7 @@ testResult_t threadInit(struct threadArgs* args) {
   for (int i=0; i<args->nGpus; i++) {
     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
@@ -886,6 +887,8 @@ int main(int argc, char* argv[]) {
     {}
   };

+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));
+
   while(1) {
     int c;
     c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex);
@@ -899,6 +902,10 @@ int main(int argc, char* argv[]) {
       break;
     case 'g':
       nGpus = strtol(optarg, NULL, 0);
+      if (nGpus > nGpusVisible) {
+        fprintf(stderr, "invalid number of GPUs specified (%d), only %d GPUs are visible\n", nGpus, nGpusVisible);
+        return -1;
+      }
       break;
     case 'b':
       parsed = parsesize(optarg);
@@ -1042,7 +1049,7 @@ testResult_t run() {
     int cudaDev = localRank*nThreads*nGpus+i;
     int rank = proc*nThreads*nGpus+i;
     cudaDeviceProp prop;
-    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
+    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev % nGpusVisible));
     len += snprintf(line+len, MAX_LINE-len, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
            rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
     maxMem = std::min(maxMem, prop.totalGlobalMem);
@@ -1086,7 +1093,7 @@ testResult_t run() {
   ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads);

   for (int i=0; i<nGpus*nThreads; i++) {
-    CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+    CUDACHECK(cudaSetDevice((localRank*nThreads*nGpus+i) % nGpusVisible));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus));
     CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
@@ -1101,7 +1108,7 @@ testResult_t run() {
   } else {
     NCCLCHECK(ncclGroupStart());
     for (int i=0; i<nGpus*nThreads; i++) {
-      CUDACHECK(cudaSetDevice(localRank*nThreads*nGpus+i));
+      CUDACHECK(cudaSetDevice((localRank*nThreads*nGpus+i) % nGpusVisible));
       NCCLCHECK(ncclCommInitRank(comms+i, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+i));
     }
     NCCLCHECK(ncclGroupEnd());
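
A caveat that applies to the cudaGetDeviceCount() call added to main(): the CUDA runtime reads CUDA_VISIBLE_DEVICES when it initializes on the process's first CUDA call, so the variable only takes effect if it is set before that point. A small sketch demonstrating this (assumes a machine with at least two GPUs):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main() {
  // Must happen before any CUDA runtime call in this process.
  setenv("CUDA_VISIBLE_DEVICES", "0", 1);
  int n = 0;
  if (cudaGetDeviceCount(&n) != cudaSuccess) return 1;
  printf("visible: %d\n", n);        // 1, regardless of physical GPU count
  // Too late: the runtime has already initialized and cached the mask.
  setenv("CUDA_VISIBLE_DEVICES", "0,1", 1);
  cudaGetDeviceCount(&n);
  printf("still visible: %d\n", n);  // still 1
  return 0;
}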
4 changes: 3 additions & 1 deletion src/gather.cu
@@ -31,10 +31,12 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
4 changes: 3 additions & 1 deletion src/hypercube.cu
@@ -34,10 +34,12 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
4 changes: 3 additions & 1 deletion src/reduce.cu
@@ -31,10 +31,12 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
4 changes: 3 additions & 1 deletion src/reduce_scatter.cu
@@ -31,10 +31,12 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
4 changes: 3 additions & 1 deletion src/scatter.cu
@@ -30,10 +30,12 @@ void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *param
 testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
4 changes: 3 additions & 1 deletion src/sendrecv.cu
@@ -31,10 +31,12 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
   size_t sendcount = args->sendBytes / wordSize(type);
   size_t recvcount = args->expectedBytes / wordSize(type);
   int nranks = args->nProcs*args->nThreads*args->nGpus;
+  int nGpusVisible;
+  CUDACHECK(cudaGetDeviceCount(&nGpusVisible));

   for (int i=0; i<args->nGpus; i++) {
     int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    CUDACHECK(cudaSetDevice(gpuid));
+    CUDACHECK(cudaSetDevice(gpuid % nGpusVisible));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
