diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f91a43f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+FROM nvcr.io/nvidia/pytorch:23.11-py3
+
+WORKDIR /
+
+# Change the apt download source; comment this out if you don't need the mirror
+COPY sources.list /etc/apt/sources.list
+RUN apt-get update && \
+    apt-get install -y openssh-server vim curl inetutils-ping net-tools telnet lsof
+
+COPY start.sh /start.sh
+COPY sshd_config /etc/ssh/sshd_config
+COPY nccl-tests /nccl-tests
+
+CMD ["/bin/bash", "start.sh"]
\ No newline at end of file
diff --git a/README.md b/README.md
index de5ffbf..f75a3f6 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,73 @@
-# build-nccl-tests-with-pytorch
-This is a dockerfile to build PyTorch executing NCCL-Tests.
+# Build-NCCL-Tests-With-PyTorch
+
+![license](https://img.shields.io/hexpm/l/plug.svg)
+[![docker](https://img.shields.io/docker/pulls/mayooot/nccl-tests-with-pytorch.svg)](https://hub.docker.com/r/mayooot/nccl-tests-with-pytorch)
+
+# Overview
+
+Build [NCCL-Tests](https://github.com/NVIDIA/nccl-tests) and configure SSHD in a PyTorch container to help you test NCCL
+faster!
+
+PyTorch Version: 23.11
+
+# Quick Start
+
+~~~shell
+docker pull mayooot/nccl-tests-with-pytorch:v0.0.1
+~~~
+
+# Build From Source
+
+~~~shell
+git clone https://github.com/mayooot/build-nccl-tests-with-pytorch
+cd build-nccl-tests-with-pytorch
+
+docker build -t nccl-tests-with-pytorch:latest .
+~~~
+
+# Usage
+
+The default values for `PORT` and `PASS` are 12345; you can override them with `-e`.
+
+In addition, you need to mount the host's `id_rsa` and `id_rsa.pub` into the container.
+
+~~~shell
+docker run --name foo \
+    -d -it \
+    --network=host \
+    -e PORT=1998 -e PASS=P@88w0rd \
+    -v /tmp/id_rsa:/root/.ssh/id_rsa \
+    -v /tmp/id_rsa.pub:/root/.ssh/id_rsa.pub \
+    --gpus all --shm-size=1g \
+    --cap-add=IPC_LOCK --device=/dev/infiniband \
+    mayooot/nccl-tests-with-pytorch:v0.0.1
+~~~
+
+The NCCL-Tests code and executables are located in `/nccl-tests`. The following shows how to use them,
+taking `all_reduce_perf` as an example.
+
+Before running `all_reduce_perf`, you need to set up passwordless SSH between the cluster nodes.
+
+~~~shell
+ssh-copy-id -p 1998 root@all_cluster_ip
+~~~
+
+Replace `--host cluster_ip1,cluster_ip2,...` with the real IP addresses of the cluster nodes.
+
+~~~shell
+docker exec -it foo bash
+
+cd /nccl-tests
+
+mpirun --allow-run-as-root \
+    -mca plm_rsh_args "-p 1998" \
+    -x NCCL_DEBUG=INFO \
+    -x NCCL_IB_HCA=mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17 \
+    --host cluster_ip1,cluster_ip2,... \
+    ./build/all_reduce_perf \
+    -b 1G -e 4G -f 2 -g 8
+~~~
+
+# Contribute
+
+Feel free to open issues and pull requests. Any feedback is highly appreciated!
\ No newline at end of file
diff --git a/nccl-tests/LICENSE.txt b/nccl-tests/LICENSE.txt
new file mode 100644
index 0000000..4573c07
--- /dev/null
+++ b/nccl-tests/LICENSE.txt
@@ -0,0 +1,27 @@
+
+ Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/nccl-tests/Makefile b/nccl-tests/Makefile new file mode 100644 index 0000000..f652b78 --- /dev/null +++ b/nccl-tests/Makefile @@ -0,0 +1,23 @@ +# +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# +# See LICENCE.txt for license information +# + +BUILDDIR ?= build +override BUILDDIR := $(abspath $(BUILDDIR)) + +.PHONY: all clean + +default: src.build + +TARGETS=src + +all: ${TARGETS:%=%.build} +clean: ${TARGETS:%=%.clean} + +%.build: + ${MAKE} -C $* build BUILDDIR=${BUILDDIR} + +%.clean: + ${MAKE} -C $* clean BUILDDIR=${BUILDDIR} diff --git a/nccl-tests/README.md b/nccl-tests/README.md new file mode 100644 index 0000000..4281799 --- /dev/null +++ b/nccl-tests/README.md @@ -0,0 +1,72 @@ +# NCCL Tests + +These tests check both the performance and the correctness of [NCCL](http://github.com/nvidia/nccl) operations. + +## Build + +To build the tests, just type `make`. + +If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME. + +```shell +$ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl +``` + +NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed. + +```shell +$ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl +``` + +## Usage + +NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). + +### Quick examples + +Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : +```shell +$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 +``` + +Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs: +```shell +$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 +``` + +### Performance + +See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column. + +### Arguments + +All tests support the same set of arguments : + +* Number of GPUs + * `-t,--nthreads ` number of threads per process. Default : 1. + * `-g,--ngpus ` number of gpus per thread. Default : 1. +* Sizes to scan + * `-b,--minbytes ` minimum size to start with. Default : 32M. + * `-e,--maxbytes ` maximum size to end at. 
Default : 32M. + * Increments can be either fixed or a multiplication factor. Only one of those should be used + * `-i,--stepbytes ` fixed increment between sizes. Default : 1M. + * `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. +* NCCL operations arguments + * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. + * `-d,--datatype ` Specify which datatype to use. Default : Float. + * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. +* Performance + * `-n,--iters ` number of iterations. Default : 20. + * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. + * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. +* Test operation + * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. + * `-c,--check ` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. + * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. + * `-G,--cudagraph ` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. + +## Copyright + +NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + diff --git a/nccl-tests/build/all_gather_perf b/nccl-tests/build/all_gather_perf new file mode 100755 index 0000000..2365e17 Binary files /dev/null and b/nccl-tests/build/all_gather_perf differ diff --git a/nccl-tests/build/all_reduce_perf b/nccl-tests/build/all_reduce_perf new file mode 100755 index 0000000..5e1cac3 Binary files /dev/null and b/nccl-tests/build/all_reduce_perf differ diff --git a/nccl-tests/build/alltoall_perf b/nccl-tests/build/alltoall_perf new file mode 100755 index 0000000..308cf9b Binary files /dev/null and b/nccl-tests/build/alltoall_perf differ diff --git a/nccl-tests/build/broadcast_perf b/nccl-tests/build/broadcast_perf new file mode 100755 index 0000000..2af5a3c Binary files /dev/null and b/nccl-tests/build/broadcast_perf differ diff --git a/nccl-tests/build/gather_perf b/nccl-tests/build/gather_perf new file mode 100755 index 0000000..afaaab0 Binary files /dev/null and b/nccl-tests/build/gather_perf differ diff --git a/nccl-tests/build/hypercube_perf b/nccl-tests/build/hypercube_perf new file mode 100755 index 0000000..c504e14 Binary files /dev/null and b/nccl-tests/build/hypercube_perf differ diff --git a/nccl-tests/build/reduce_perf b/nccl-tests/build/reduce_perf new file mode 100755 index 0000000..b5e90c7 Binary files /dev/null and b/nccl-tests/build/reduce_perf differ diff --git a/nccl-tests/build/reduce_scatter_perf b/nccl-tests/build/reduce_scatter_perf new file mode 100755 index 0000000..8c13291 Binary files /dev/null and b/nccl-tests/build/reduce_scatter_perf differ diff --git a/nccl-tests/build/scatter_perf b/nccl-tests/build/scatter_perf new file mode 100755 index 0000000..4302642 Binary files /dev/null and b/nccl-tests/build/scatter_perf differ diff --git a/nccl-tests/build/sendrecv_perf b/nccl-tests/build/sendrecv_perf new file mode 100755 index 0000000..d432bc9 Binary files /dev/null and 
b/nccl-tests/build/sendrecv_perf differ diff --git a/nccl-tests/build/timer.o b/nccl-tests/build/timer.o new file mode 100644 index 0000000..2b159c4 Binary files /dev/null and b/nccl-tests/build/timer.o differ diff --git a/nccl-tests/build/verifiable/verifiable.o b/nccl-tests/build/verifiable/verifiable.o new file mode 100644 index 0000000..84ab6f1 Binary files /dev/null and b/nccl-tests/build/verifiable/verifiable.o differ diff --git a/nccl-tests/doc/PERFORMANCE.md b/nccl-tests/doc/PERFORMANCE.md new file mode 100644 index 0000000..21fef60 --- /dev/null +++ b/nccl-tests/doc/PERFORMANCE.md @@ -0,0 +1,144 @@ +# Performance reported by NCCL tests + +NCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used. + +# Time + +Time is useful with small sizes, to measure the constant overhead (or latency) associated with operations. + +On large sizes, the time becomes linear with the size (since it is roughly equal to overhead + size / bw) and is no longer measuring the latency but +also the bandwidth multiplied by the size. + +Therefore, on large sizes, it makes more sense to look at the bandwidth. + +# Bandwidth + +## Algorithm bandwidth + +Algorithm bandwidth is using the most commonly used formula for bandwidth : size (_S_) / time (_t_). It is useful to compute how much time any large operation would take by simply dividing the size of the operation by the algorithm bandwidth. + +`algbw = S/t` + +## Bus bandwidth + +While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful to measure collective operations speed, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth, usually depending on the number of ranks. +Most benchmarks only provide time measurements, which is hard to interpret for large sizes. Some others also provide algorithms bandwidth, but see that depending on the number of ranks, that bandwidth varies (and decreases as the number of ranks increase). + +To provide a number which reflects how optimally the hardware is used, NCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output). +This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. +Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. + +The formula depends on the collective operation. + +### AllReduce + +An allreduce operation, for each element of the N arrays (input i_X and output o_X, each situated on rank X), is performing the following operation : + +`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}` + +**Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).** + +A ring would do that operation in an order which follows the ring : + +`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}` + +A tree would do it hierarchically : + +`(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))` + +In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output), +we need 2(n-1) data transfers (x number of elements) to perform an allReduce operation. 
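As a preview of how this transfer count turns into the "busbw" number (the factor 2*(n-1)/n is derived just below), here is a minimal C sketch of the conversion from measured time to the two reported bandwidths. It mirrors `AllReduceGetBw` in `src/all_reduce.cu`; the helper name and the example numbers are illustrative only.

```c
#include <stdio.h>

/* Illustrative sketch: convert a measured AllReduce time into the algorithm
 * bandwidth (algbw = S/t) and the bus bandwidth (busbw = algbw * 2*(n-1)/n). */
static void allReduceBandwidths(size_t bytes, double sec, int nranks,
                                double *algBw, double *busBw) {
  *algBw = (double)bytes / 1.0E9 / sec;            /* GB/s */
  *busBw = *algBw * (2.0 * (nranks - 1)) / nranks;
}

int main(void) {
  double algBw, busBw;
  /* Example: 1 GB reduced across 8 ranks in 25 ms -> algbw 40 GB/s, busbw 70 GB/s */
  allReduceBandwidths(1000000000, 0.025, 8, &algBw, &busBw);
  printf("algbw=%.1f GB/s busbw=%.1f GB/s\n", algBw, busBw);
  return 0;
}
```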
+ +Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best : + + `t = (S*2*(n-1)) / (n*B)` + +Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them. +Reordering the equation, we find that + + `t = (S/B) * (2*(n-1)/n)` + +Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute : + + `B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)` + +### ReduceScatter + +The ReduceScatter operation requires only to perform the addition part of the allReduce operation : + + `o_K = i_0 + i_1 + i_2 + ... + i_{n-1}` + +With K being the rank which is getting the final result(K=offset/recvsize). + +The perfect reduceScatter time with a rank bandwidth of B would therefore be : + + `t = S*(n-1) / (B*n)` + +And the Bus Bandwidth is therefore computed as : + + `B = S/t * (n-1)/n = algbw * (n-1)/n` + +Note that here, S is the size in bytes of the total array, which for NCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank. + +### AllGather + +The AllGather operation requires only to perform the assignment part of the allReduce operation : + + `o_0 = o_1 = o_2 = ... = o_{n-1} = i_K` + +With K being the rank where the data originates from (K=offset*sendsize). + +The perfect allGather time with a rank bandwidth of B would therefore be : + + `t = S*(n-1) / (B*n)` + +And the Bus Bandwidth is therefore computed as : + + `B = S/t * (n-1)/n = algbw * (n-1)/n` + +Note that here, S is the size in bytes of the total array, which for NCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank. + +### Broadcast + +The broadcast operation representation is similar to allGather : + + `o_0 = o_1 = o_2 = ... = o_{n-1} = i_R` + +R being the root of the operation. + +However, in this case, since the i_R input is not evenly distributed on the ranks, we cannot use all N links to perform the transfer operations. +Indeed, *all* data has to get out of the root rank, hence the bottleneck is on the root rank which only has B as capacity to get data out : + + `t = S/B` + +And : + + `B = S/t` + +### Reduce + +The reduce operation performs : + + `o_R = i_0 + i_1 + i_2 + ... + i_{n-1}` + +R being the root of the operation. + +Similarly to broadcast, all data need to be sent to the root, hence : + + `t = S/B` + +And : + + `B = S/t` + +### Summary + +To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth : + +* AllReduce : 2*(_n_-1)/_n_ +* ReduceScatter : (_n_-1)/_n_ +* AllGather : (_n_-1)/_n_ +* Broadcast : 1 +* Reduce : 1 + +The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network. diff --git a/nccl-tests/src/Makefile b/nccl-tests/src/Makefile new file mode 100644 index 0000000..393de8e --- /dev/null +++ b/nccl-tests/src/Makefile @@ -0,0 +1,105 @@ +# +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 0 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC ?= $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_80,code=compute_80 +else +NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ + -gencode=arch=compute_50,code=sm_50 \ + -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_70,code=compute_70 +endif + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 +CXXFLAGS := -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi +endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif +LIBRARIES += nccl +NVLDFLAGS += $(LIBRARIES:%=-l%) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +TEST_VERIFIABLE_SRCDIR := ../verifiable +TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable +include ../verifiable/verifiable.mk + +${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/timer.o: timer.cc timer.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(CXX) $(CXXFLAGS) -o $@ -c timer.cc + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/nccl-tests/src/all_gather.cu b/nccl-tests/src/all_gather.cu new file mode 100644 index 0000000..0831207 --- /dev/null +++ b/nccl-tests/src/all_gather.cu @@ -0,0 +1,93 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + +#define ALIGN 4 + +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + size_t base = (count/(ALIGN*nranks))*ALIGN; + *sendcount = base; + *recvcount = base*nranks; + *sendInplaceOffset = base; + *recvInplaceOffset = 0; + *paramcount = base; +} + +testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); + return testSuccess; +} + +struct testColl allGatherTest = { + "AllGather", + AllGatherGetCollByteCount, + AllGatherInitData, + AllGatherGetBw, + AllGatherRunColl +}; + +void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allGatherTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; isendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; +} + +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; + +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; isendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place alltoall + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + size_t rankOffset = count * wordSize(type); + +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoall. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else + NCCLCHECK(ncclGroupStart()); + for (int r=0; rcollTest = &alltoAllTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; isendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); + TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 + NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream)); +#else + if (rank == root) { + NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream)); + } else { + NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream)); + } +#endif + return testSuccess; +} + +struct testColl broadcastTest = { + "Broadcast", + BroadcastGetCollByteCount, + BroadcastInitData, + BroadcastGetBw, + BroadcastRunColl +}; + +void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + BroadcastGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &broadcastTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i +#include +#include +#include +#include +#include "cuda.h" + +#include "../verifiable/verifiable.h" + +int test_ncclVersion = 0; // init'd with ncclGetVersion() + +#if NCCL_MAJOR >= 2 + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) 
&& NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; + + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; +#else + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; +#endif + +// For libnccl's < 2.13 +extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) { + return ""; +} + +int is_main_proc = 0; +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int agg_iters = 1; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int parallel_init = 0; +static int blocking_coll = 0; +static int streamnull = 0; +static int timeout = 0; +static int cudaGraphLaunches = 0; +static int report_cputime = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { + ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); + CUDACHECK(cudaDeviceSynchronize()); + return testSuccess; +} + +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { + ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); + return testSuccess; +} + +testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { + ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); + return testSuccess; +} + +void Barrier(struct threadArgs *args) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static int counter[2] = {0, 0}; + + 
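  // Double-buffered barrier: there are two lock/cond/counter sets, indexed by the
  // thread-local 'epoch', which flips after every call. A thread that returns and
  // immediately re-enters Barrier() therefore uses the other set and cannot disturb
  // the counter that slower threads are still waiting on here. The thread with the
  // highest index waits until all local threads have arrived, performs the MPI
  // barrier when MPI_SUPPORT is defined, then resets the counter and wakes the
  // others, which wait for that reset before leaving.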
pthread_mutex_lock(&lock[epoch]); + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + #ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); + #endif + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); + } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + } + pthread_mutex_unlock(&lock[epoch]); + epoch ^= 1; +} + +// Inter-thread/process barrier+allreduce. The quality of the return value +// for average=0 (which means broadcast from rank=0) is dubious. The returned +// value will actually be the result of process-local broadcast from the local thread=0. +template +void Allreduce(struct threadArgs* args, T* value, int average) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static T accumulator[2]; + static int counter[2] = {0, 0}; + + pthread_mutex_lock(&lock[epoch]); + if(counter[epoch] == 0) { + if(average != 0 || args->thread == 0) accumulator[epoch] = *value; + } else { + switch(average) { + case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break; + case /*avg*/1: accumulator[epoch] += *value; break; + case /*min*/2: accumulator[epoch] = std::min(accumulator[epoch], *value); break; + case /*max*/3: accumulator[epoch] = std::max(accumulator[epoch], *value); break; + case /*sum*/4: accumulator[epoch] += *value; break; + } + } + + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + + #ifdef MPI_SUPPORT + if(average != 0) { + static_assert(std::is_same::value || std::is_same::value, "Allreduce only for T in {long long, double}"); + MPI_Datatype ty = std::is_same::value ? MPI_LONG_LONG : + std::is_same::value ? MPI_DOUBLE : + MPI_Datatype(); + MPI_Op op = average == 1 ? MPI_SUM : + average == 2 ? MPI_MIN : + average == 3 ? MPI_MAX : + average == 4 ? MPI_SUM : MPI_Op(); + MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD); + } + #endif + + if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads; + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); + } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + } + pthread_mutex_unlock(&lock[epoch]); + + *value = accumulator[epoch]; + epoch ^= 1; +} + +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count = args->expectedBytes/wordSize(type); + + int64_t *wrongPerGpu = nullptr; + CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped)); + + for (int i=0; inGpus; i++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaSetDevice(args->gpus[i])); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + + TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); + +#if 1 && DEBUG_PRINT + if (args->reportErrors && wrongPerGpu[i] != 0) { + printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]); + char *expectedHost = (char*)malloc(args->expectedBytes); + char *dataHost = (char*)malloc(args->expectedBytes); + int eltsz = wordSize(type); + cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost); + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + + for(int j=0; jexpectedBytes/eltsz; j++) { + unsigned long long want, got; + want = 0; + memcpy(&want, expectedHost + j*eltsz, eltsz); + got = 0; + memcpy(&got, dataHost + j*eltsz, eltsz); + if(want != got) { + printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got); + } + } + free(expectedHost); + free(dataHost); + } +#endif + } + + *wrongElts = 0; + for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i]; + cudaFreeHost(wrongPerGpu); + + if (args->reportErrors && *wrongElts) args->errors[0]++; + return testSuccess; +} + +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + timer tim; + + while (remaining) { + int idle = 1; + for (int i=0; i= NCCL_VERSION(2,4,0) + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; i timeout && timeout > 0) { + for (int i=0; inbytes / wordSize(type); + + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); + + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + CUDACHECK(cudaSetDevice(args->gpus[i])); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + switch(type) { + case ncclInt8: i8 = ncclVerifiablePremulScalar(rank); break; + case ncclUint8: u8 = ncclVerifiablePremulScalar(rank); break; + case ncclInt32: i32 = ncclVerifiablePremulScalar(rank); break; + case ncclUint32: u32 = ncclVerifiablePremulScalar(rank); break; + case ncclInt64: i64 = ncclVerifiablePremulScalar(rank); break; + case ncclUint64: u64 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat16: f16 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat32: f32 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat64: f64 = ncclVerifiablePremulScalar(rank); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif + } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + } + if (blocking_coll) Barrier(args); + return testSuccess; +} + +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; + + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; +} + +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } + + // Sync + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + Barrier(args); + +#if CUDART_VERSION >= 11030 + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + // Thread local mdoe is needed for: + // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads + // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture. 
+ // Since pre-connect calls cudaMalloc, we cannot use global capture mode + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } +#endif + + // Performance Benchmark + timer tim; + for (int iter = 0; iter < iters; iter++) { + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); + } + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Resync CPU, restart timing, launch cuda graph + Barrier(args); + tim.reset(); + for (int l=0; lnGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } + } +#endif + + double cputimeSec = tim.elapsed()/(iters*agg_iters); + TESTCHECK(completeColl(args)); + + double deltaSec = tim.elapsed(); + deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + Allreduce(args, &deltaSec, average); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + double algBw, busBw; + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + int64_t wrongElts = 0; + static __thread int rep = 0; + rep++; + for (int c = 0; c < datacheck; c++) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture for data check + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); + } + } +#endif + + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Launch cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } +#endif + + TESTCHECK(completeColl(args)); + +#if CUDART_VERSION >= 11030 + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } +#endif + + TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts)); + + //aggregate delta from all threads and procs + long long wrongElts1 = wrongElts; + //if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts); + Allreduce(args, &wrongElts1, /*sum*/4); + wrongElts = wrongElts1; + if (wrongElts) break; + } + + double timeUsec = (report_cputime ? 
cputimeSec : deltaSec)*1.0E6; + char timeStr[100]; + if (timeUsec >= 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec >= 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); + } else { + sprintf(timeStr, "%7.2f", timeUsec); + } + if (args->reportErrors) { + PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + } + + args->bw[0] += busBw; + args->bw_count[0]++; + return testSuccess; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + + count = size / wordSize(type); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Sync to avoid first-call timeout + Barrier(args); + + // Warm-up for large size + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); + + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + char rootName[100]; + sprintf(rootName, "%6i", root); + PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); + TESTCHECK(BenchTime(args, type, op, root, 0)); + TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } + return testSuccess; +} + +testResult_t threadRunTests(struct threadArgs* args) { + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + CUDACHECK(cudaSetDevice(args->gpus[0])); + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; +} + +testResult_t threadInit(struct threadArgs* args) { + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + //set main thread again + is_main_thread = (is_main_proc && args->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(args->gpus[i])); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); + } + return testSuccess; +} + +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); + return NULL; +} +testResult_t threadLaunch(struct testThread* thread) { + pthread_create(&thread->thread, NULL, threadLauncher, thread); + return testSuccess; +} + +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); + return testSuccess; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"stream_null", required_argument, 0, 'y'}, + {"timeout", required_argument, 0, 'T'}, + {"cudagraph", required_argument, 0, 'G'}, + {"report_cputime", required_argument, 0, 'C'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + 
stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) + agg_iters = (int)strtol(optarg, NULL, 0); +#else + fprintf(stderr, "Option -m not supported before NCCL 2.2. Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'y': + streamnull = strtol(optarg, NULL, 0); + break; + case 'T': + timeout = strtol(optarg, NULL, 0); + break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); +#endif + break; + case 'C': + report_cputime = strtol(optarg, NULL, 0); + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check ] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else + "[-o,--op ] \n\t" +#endif + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-y,--stream_null <0/1>] \n\t" + "[-T,--timeout