From 42da22362c3303f9bef209e60108e5a643a1e5e0 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Wed, 2 Aug 2023 06:53:03 +0000 Subject: [PATCH 1/7] transfer bench code integration to rvs --- CMakeLists.txt | 2 +- transferbench.so/CMakeLists.txt | 182 ++ transferbench.so/include/Compatibility.hpp | 93 + transferbench.so/include/EnvVars.hpp | 510 ++++ .../include/GetClosestNumaNode.hpp | 149 + transferbench.so/include/Kernels.hpp | 409 +++ transferbench.so/include/TransferBench.hpp | 198 ++ transferbench.so/include/action.h | 86 + transferbench.so/include/rvs_module.h | 31 + transferbench.so/include/tfb_worker.h | 74 + transferbench.so/src/.gitignore | 1 + transferbench.so/src/action.cpp | 215 ++ transferbench.so/src/hiptest_worker.cpp | 96 + transferbench.so/src/rvs_module.cpp | 91 + transferbench.so/src/tfb_worker.cpp | 1953 +++++++++++++ transferbench.so/tests.cmake | 27 + transferbench.so/transferbench/CHANGELOG.md | 199 ++ transferbench.so/transferbench/CMakeLists.txt | 31 + transferbench.so/transferbench/LICENSE.md | 19 + transferbench.so/transferbench/README.md | 63 + .../transferbench/docs/.doxygen/Doxyfile | 2455 +++++++++++++++++ .../transferbench/docs/.sphinx/_toc.yml.in | 13 + .../docs/.sphinx/requirements.in | 1 + .../docs/.sphinx/requirements.txt | 155 ++ transferbench.so/transferbench/docs/api.rst | 5 + transferbench.so/transferbench/docs/conf.py | 14 + .../docs/examples/configfile_format.rst | 93 + .../transferbench/docs/examples/index.rst | 11 + transferbench.so/transferbench/docs/index.rst | 11 + .../transferbench/docs/instructions.rst | 36 + .../transferbench/examples/example.cfg | 79 + 31 files changed, 7301 insertions(+), 1 deletion(-) create mode 100644 transferbench.so/CMakeLists.txt create mode 100644 transferbench.so/include/Compatibility.hpp create mode 100644 transferbench.so/include/EnvVars.hpp create mode 100644 transferbench.so/include/GetClosestNumaNode.hpp create mode 100644 transferbench.so/include/Kernels.hpp create mode 100644 
transferbench.so/include/TransferBench.hpp create mode 100644 transferbench.so/include/action.h create mode 100644 transferbench.so/include/rvs_module.h create mode 100644 transferbench.so/include/tfb_worker.h create mode 100644 transferbench.so/src/.gitignore create mode 100644 transferbench.so/src/action.cpp create mode 100644 transferbench.so/src/hiptest_worker.cpp create mode 100644 transferbench.so/src/rvs_module.cpp create mode 100644 transferbench.so/src/tfb_worker.cpp create mode 100644 transferbench.so/tests.cmake create mode 100644 transferbench.so/transferbench/CHANGELOG.md create mode 100644 transferbench.so/transferbench/CMakeLists.txt create mode 100644 transferbench.so/transferbench/LICENSE.md create mode 100644 transferbench.so/transferbench/README.md create mode 100644 transferbench.so/transferbench/docs/.doxygen/Doxyfile create mode 100644 transferbench.so/transferbench/docs/.sphinx/_toc.yml.in create mode 100644 transferbench.so/transferbench/docs/.sphinx/requirements.in create mode 100644 transferbench.so/transferbench/docs/.sphinx/requirements.txt create mode 100644 transferbench.so/transferbench/docs/api.rst create mode 100644 transferbench.so/transferbench/docs/conf.py create mode 100644 transferbench.so/transferbench/docs/examples/configfile_format.rst create mode 100644 transferbench.so/transferbench/docs/examples/index.rst create mode 100644 transferbench.so/transferbench/docs/index.rst create mode 100644 transferbench.so/transferbench/docs/instructions.rst create mode 100644 transferbench.so/transferbench/examples/example.cfg diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a750b23..e54fd195 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -474,7 +474,7 @@ add_subdirectory(rcqt.so) add_subdirectory(smqt.so) add_subdirectory(mem.so) add_subdirectory(babel.so) -#add_subdirectory(edp.so) +add_subdirectory(transferbench.so) add_subdirectory(perf.so) if (RVS_BUILD_TESTS) diff --git a/transferbench.so/CMakeLists.txt 
b/transferbench.so/CMakeLists.txt new file mode 100644 index 00000000..21165721 --- /dev/null +++ b/transferbench.so/CMakeLists.txt @@ -0,0 +1,182 @@ +################################################################################ +## +## Copyright (c) 2018 ROCm Developer Tools +## +## MIT LICENSE: +## Permission is hereby granted, free of charge, to any person obtaining a copy of +## this software and associated documentation files (the "Software"), to deal in +## the Software without restriction, including without limitation the rights to +## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +## of the Software, and to permit persons to whom the Software is furnished to do +## so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. 
+## +################################################################################ + +cmake_minimum_required ( VERSION 3.5.0 ) +if ( ${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) + message(FATAL "In-source build is not allowed") +endif () +set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + +set ( RVS "tfb" ) +set ( RVS_PACKAGE "rvs-roct" ) +set ( RVS_COMPONENT "lib${RVS}" ) +set ( RVS_TARGET "${RVS}" ) + +project ( ${RVS_TARGET} ) + +message(STATUS "MODULE: ${RVS}") +add_compile_options(-std=c++11) +add_compile_options(-Wall ) +if (RVS_COVERAGE) + add_compile_options(-o0 -fprofile-arcs -ftest-coverage) + set(CMAKE_EXE_LINKER_FLAGS "--coverage") + set(CMAKE_SHARED_LINKER_FLAGS "--coverage") +endif() + +## Set default module path if not already set +if ( NOT DEFINED CMAKE_MODULE_PATH ) + set ( CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake_modules/" ) +endif () + +## Include common cmake modules +include ( utils ) + +## Setup the package version. +get_version ( "0.0.0" ) + +set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) +set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) +set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) + set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) +endif () +set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +## make version numbers visible to C code +add_compile_options(-DBUILD_VERSION_MAJOR=${VERSION_MAJOR}) +add_compile_options(-DBUILD_VERSION_MINOR=${VERSION_MINOR}) +add_compile_options(-DBUILD_VERSION_PATCH=${VERSION_PATCH}) +add_compile_options(-DLIB_VERSION_STRING="${LIB_VERSION_STRING}") +add_compile_options(-DBUILD_VERSION_STRING="${BUILD_VERSION_STRING}") + +set(ROCBLAS_LIB "rocblas") +set(HIP_HCC_LIB "amdhip64") + +# Determine HSA_PATH +if(NOT DEFINED HIPCC_PATH) + if(NOT 
DEFINED ENV{HIPCC_PATH}) + set(HIPCC_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which hipcc runtime has been installed") + else() + set(HIPCC_PATH $ENV{HIPCC_PATH} CACHE PATH "Path to which hipcc runtime has been installed") + endif() +endif() + +# Determine HSA_PATH +if(NOT DEFINED HSA_PATH) + if(NOT DEFINED ENV{HSA_PATH}) + set(HSA_PATH "${ROCM_PATH}/hsa" CACHE PATH "Path to which HSA runtime has been installed") + else() + set(HSA_PATH $ENV{HSA_PATH} CACHE PATH "Path to which HSA runtime has been installed") + endif() +endif() + +# Add HIP_VERSION to CMAKE__FLAGS +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}") + +set(HIP_HCC_BUILD_FLAGS) +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include ${ASAN_CXX_FLAGS}") + +# Set compiler and compiler flags +set(CMAKE_CXX_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_C_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS} -L${ROCM_PATH}/lib") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LD_FLAGS}") +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LD_FLAGS}") + +if(BUILD_ADDRESS_SANITIZER) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.asan-x86_64.so + OUTPUT_VARIABLE ASAN_LIB_FULL_PATH) + get_filename_component(ASAN_LIB_PATH ${ASAN_LIB_FULL_PATH} DIRECTORY) +endif() + +# Determine Roc Runtime header files are accessible +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h) + message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h) + message("ERROR: ROC Runtime headers can't be found under specified path. 
Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +# Determine Roc Runtime header files are accessible +if(DEFINED RVS_ROCMSMI) + if(NOT RVS_ROCMSMI EQUAL 1) + if(NOT EXISTS ${ROCBLAS_INC_DIR}/rocblas.h) + message("ERROR: rocBLAS headers can't be found under specified path. Please set ROCBLAS_INC_DIR path. Current value is : " ${ROCBLAS_INC_DIR}) + RETURN() + endif() + + if(NOT EXISTS "${ROCBLAS_LIB_DIR}/lib${ROCBLAS_LIB}.so") + message("ERROR: rocBLAS library can't be found under specified path. Please set ROCBLAS_LIB_DIR path. Current value is : " ${ROCBLAS_LIB_DIR}) + RETURN() + endif() + endif() +endif() + + +if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() +endif() + +## define include directories +include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${ROCM_PATH}/include) +# Add directories to look for library files to link +link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) +## additional libraries +set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so numa hsa-runtime64 ) + +## define source files +set(SOURCES src/rvs_module.cpp src/action.cpp src/tfb_worker.cpp) + +## define target +add_library( ${RVS_TARGET} SHARED ${SOURCES}) +set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB}) +add_dependencies(${RVS_TARGET} rvslibrt rvslib) + +add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +COMMAND ln -fs ./lib${RVS}.so.${LIB_VERSION_STRING} lib${RVS}.so.${VERSION_MAJOR} WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} +COMMAND ln -fs ./lib${RVS}.so.${VERSION_MAJOR} lib${RVS}.so WORKING_DIRECTORY 
${CMAKE_RUNTIME_OUTPUT_DIRECTORY} +) + +install(TARGETS ${RVS_TARGET} LIBRARY DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule) +install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR}" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule) +install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule) + +# TEST SECTION +#if (RVS_BUILD_TESTS) +# add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +# COMMAND ln -fs ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR} ${RVS_BINTEST_FOLDER}/lib${RVS}.so WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} +# ) +# include(${CMAKE_CURRENT_SOURCE_DIR}/tests.cmake) +#endif() diff --git a/transferbench.so/include/Compatibility.hpp b/transferbench.so/include/Compatibility.hpp new file mode 100644 index 00000000..5e76cf50 --- /dev/null +++ b/transferbench.so/include/Compatibility.hpp @@ -0,0 +1,93 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#if defined(__NVCC__) + +#include + +// ROCm specific +#define __builtin_amdgcn_s_memrealtime clock64 +#define gcnArchName name + +// Datatypes +#define hipDeviceProp_t cudaDeviceProp +#define hipError_t cudaError_t +#define hipEvent_t cudaEvent_t +#define hipStream_t cudaStream_t + +// Enumerations +#define hipDeviceAttributeClockRate cudaDevAttrClockRate +#define hipDeviceAttributeMaxSharedMemoryPerMultiprocessor cudaDevAttrMaxSharedMemoryPerMultiprocessor +#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount +#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled +#define hipFuncCachePreferShared cudaFuncCachePreferShared +#define hipMemcpyDefault cudaMemcpyDefault +#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define hipMemcpyHostToDevice cudaMemcpyHostToDevice +#define hipSuccess cudaSuccess + +// Functions +#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer +#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess +#define hipDeviceGetAttribute cudaDeviceGetAttribute +#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId +#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig +#define hipDeviceSynchronize cudaDeviceSynchronize +#define hipEventCreate cudaEventCreate +#define hipEventDestroy cudaEventDestroy +#define hipEventElapsedTime cudaEventElapsedTime +#define hipEventRecord cudaEventRecord +#define hipFree cudaFree +#define hipGetDeviceCount cudaGetDeviceCount +#define hipGetDeviceProperties cudaGetDeviceProperties +#define hipGetErrorString cudaGetErrorString +#define hipHostFree cudaFreeHost +#define hipHostMalloc cudaMallocHost +#define hipMalloc cudaMalloc +#define hipMemcpy cudaMemcpy +#define hipMemcpyAsync 
cudaMemcpyAsync +#define hipMemset cudaMemset +#define hipMemsetAsync cudaMemsetAsync +#define hipSetDevice cudaSetDevice +#define hipStreamCreate cudaStreamCreate +#define hipStreamDestroy cudaStreamDestroy +#define hipStreamSynchronize cudaStreamSynchronize + +// Define float4 addition operator for NVIDIA platform +__device__ inline float4& operator +=(float4& a, const float4& b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; + return a; +} + +#else + +#include +#include +#include + +#endif diff --git a/transferbench.so/include/EnvVars.hpp b/transferbench.so/include/EnvVars.hpp new file mode 100644 index 00000000..bac0746e --- /dev/null +++ b/transferbench.so/include/EnvVars.hpp @@ -0,0 +1,510 @@ +/* +Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef ENVVARS_HPP +#define ENVVARS_HPP + +#include +#include +#include +#include "Compatibility.hpp" +#include "Kernels.hpp" + +#define TB_VERSION "1.22" + +extern char const MemTypeStr[]; +extern char const ExeTypeStr[]; + +enum ConfigModeEnum +{ + CFG_FILE = 0, + CFG_P2P = 1, + CFG_SWEEP = 2 +}; + +// This class manages environment variable that affect TransferBench +class EnvVars +{ +public: + // Default configuration values + int const DEFAULT_NUM_WARMUPS = 1; + int const DEFAULT_NUM_ITERATIONS = 10; + int const DEFAULT_SAMPLING_FACTOR = 1; + + // Peer-to-peer Benchmark preset defaults + int const DEFAULT_P2P_NUM_CPU_SE = 4; + + // Sweep-preset defaults + std::string const DEFAULT_SWEEP_SRC = "CG"; + std::string const DEFAULT_SWEEP_EXE = "CDG"; + std::string const DEFAULT_SWEEP_DST = "CG"; + int const DEFAULT_SWEEP_MIN = 1; + int const DEFAULT_SWEEP_MAX = 24; + int const DEFAULT_SWEEP_TEST_LIMIT = 0; + int const DEFAULT_SWEEP_TIME_LIMIT = 0; + + // Environment variables + int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy + int byteOffset; // Byte-offset for memory allocations + int continueOnError; // Continue tests even after mismatch detected + int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected) + int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected) + int numIterations; // Number of timed iterations to perform. 
If negative, run for -numIterations seconds instead + int numWarmups; // Number of un-timed warmup iterations to perform + int outputToCsv; // Output in CSV format + int samplingFactor; // Affects how many different values of N are generated (when N set to 0) + int sharedMemBytes; // Amount of shared memory to use per threadblock + int useInteractive; // Pause for user-input before starting transfer loop + int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device + int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern) + int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer + int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host + + std::vector fillPattern; // Pattern of floats used to fill source data + + // Environment variables only for Benchmark-preset + int useRemoteRead; // Use destination memory type as executor instead of source memory type + int useDmaCopy; // Use DMA copy instead of GPU copy + int numGpuSubExecs; // Number of GPU subexecutors to use + int numCpuSubExecs; // Number of CPU subexecttors to use + + // Environment variables only for Sweep-preset + int sweepMin; // Min number of simultaneous Transfers to be executed per test + int sweepMax; // Max number of simulatneous Transfers to be executed per test + int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit) + int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit) + int sweepXgmiMin; // Min number of XGMI hops for Transfers + int sweepXgmiMax; // Max number of XGMI hops for Transfers (-1 = no limit) + int sweepSeed; // Random seed to use + int sweepRandBytes; // Whether or not to use random number of bytes per Transfer + std::string sweepSrc; // Set of src memory types to be swept + std::string sweepExe; // Set of executors to be swept + std::string sweepDst; // Set of dst memory 
types to be swept + + // Developer features + int enableDebug; // Enable debug output + int gpuKernel; // Which GPU kernel to use + + // Used to track current configuration mode + ConfigModeEnum configMode; + + // Random generator + std::default_random_engine *generator; + + // Track how many CPUs are available per NUMA node + std::vector numCpusPerNuma; + + // Constructor that collects values + EnvVars() + { + int maxSharedMemBytes = 0; + HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes, + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0)); +#if !defined(__NVCC__) + int defaultSharedMemBytes = maxSharedMemBytes / 2 + 1; +#else + int defaultSharedMemBytes = 0; +#endif + + int numDeviceCUs = 0; + HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0)); + + int numDetectedCpus = numa_num_configured_nodes(); + int numDetectedGpus; + HIP_CALL(hipGetDeviceCount(&numDetectedGpus)); + + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, 0)); + std::string fullName = prop.gcnArchName; + std::string archName = fullName.substr(0, fullName.find(':')); + + // Different hardware pick different GPU kernels + // This performance difference is generally only noticable when executing fewer CUs + int defaultGpuKernel = 0; + if (archName == "gfx906") defaultGpuKernel = 13; + else if (archName == "gfx90a") defaultGpuKernel = 9; + + blockBytes = GetEnvVar("BLOCK_BYTES" , 256); + byteOffset = GetEnvVar("BYTE_OFFSET" , 0); + continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0); + numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus); + numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus); + numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS); + numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); + outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); + samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR); + sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes); + 
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); + usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); + usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0); + useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); + validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0); + enableDebug = GetEnvVar("DEBUG" , 0); + gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel); + + // P2P Benchmark related + useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0); + useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); + numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs); + numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE); + + // Sweep related + sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN); + sweepMax = GetEnvVar("SWEEP_MAX" , DEFAULT_SWEEP_MAX); + sweepSrc = GetEnvVar("SWEEP_SRC" , DEFAULT_SWEEP_SRC); + sweepExe = GetEnvVar("SWEEP_EXE" , DEFAULT_SWEEP_EXE); + sweepDst = GetEnvVar("SWEEP_DST" , DEFAULT_SWEEP_DST); + sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT" , DEFAULT_SWEEP_TEST_LIMIT); + sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT" , DEFAULT_SWEEP_TIME_LIMIT); + sweepXgmiMin = GetEnvVar("SWEEP_XGMI_MIN" , 0); + sweepXgmiMax = GetEnvVar("SWEEP_XGMI_MAX" , -1); + sweepRandBytes = GetEnvVar("SWEEP_RAND_BYTES" , 0); + + // Determine random seed + char *sweepSeedStr = getenv("SWEEP_SEED"); + sweepSeed = (sweepSeedStr != NULL ? 
atoi(sweepSeedStr) : time(NULL)); + generator = new std::default_random_engine(sweepSeed); + + // Check for fill pattern + char* pattern = getenv("FILL_PATTERN"); + if (pattern != NULL) + { + if (usePrepSrcKernel) + { + printf("[ERROR] Unable to use FILL_PATTERN and USE_PREP_KERNEL together\n"); + exit(1); + } + + int patternLen = strlen(pattern); + if (patternLen % 2) + { + printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n"); + exit(1); + } + + // Read in bytes + std::vector bytes; + unsigned char val = 0; + for (int i = 0; i < patternLen; i++) + { + if ('0' <= pattern[i] && pattern[i] <= '9') + val += (pattern[i] - '0'); + else if ('A' <= pattern[i] && pattern[i] <= 'F') + val += (pattern[i] - 'A' + 10); + else if ('a' <= pattern[i] && pattern[i] <= 'f') + val += (pattern[i] - 'a' + 10); + else + { + printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]); + exit(1); + } + + if (i % 2 == 0) + val <<= 4; + else + { + bytes.push_back(val); + val = 0; + } + } + + // Reverse bytes (input is assumed to be given in big-endian) + std::reverse(bytes.begin(), bytes.end()); + + // Figure out how many copies of the pattern are necessary to fill a 4-byte float properly + int copies; + switch (patternLen % 8) + { + case 0: copies = 1; break; + case 4: copies = 2; break; + default: copies = 4; break; + } + + // Fill floats + int numFloats = copies * patternLen / 8; + fillPattern.resize(numFloats); + unsigned char* rawData = (unsigned char*) fillPattern.data(); + for (int i = 0; i < numFloats * 4; i++) + rawData[i] = bytes[i % bytes.size()]; + } + else fillPattern.clear(); + + // Perform some basic validation + if (numCpuDevices > numDetectedCpus) + { + printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus); + exit(1); + } + if (numGpuDevices > numDetectedGpus) + { + printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of 
detected GPUs (%d)\n", numGpuDevices, numDetectedGpus); + exit(1); + } + if (byteOffset % sizeof(float)) + { + printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float)); + exit(1); + } + if (numWarmups < 0) + { + printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n"); + exit(1); + } + if (samplingFactor < 1) + { + printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n"); + exit(1); + } + if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes) + { + printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes); + exit(1); + } + if (blockBytes <= 0 || blockBytes % 4) + { + printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n"); + exit(1); + } + + if (numGpuSubExecs <= 0) + { + printf("[ERROR] NUM_GPU_SE must be greater than 0\n"); + exit(1); + } + + if (numCpuSubExecs <= 0) + { + printf("[ERROR] NUM_CPU_SE must be greater than 0\n"); + exit(1); + } + + for (auto ch : sweepSrc) + { + if (!strchr(MemTypeStr, ch)) + { + printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch); + exit(1); + } + if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) + { + printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch); + exit(1); + } + } + + for (auto ch : sweepDst) + { + if (!strchr(MemTypeStr, ch)) + { + printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch); + exit(1); + } + if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) + { + printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch); + exit(1); + } + } + + for (auto ch : sweepExe) + { + if (!strchr(ExeTypeStr, ch)) + { + printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch); + exit(1); + } + if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) + { + printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch); + exit(1); + } + } + if (gpuKernel < 0 || 
gpuKernel > NUM_GPU_KERNELS) + { + printf("[ERROR] GPU kernel must be between 0 and %d\n", NUM_GPU_KERNELS); + exit(1); + } + + // Determine how many CPUs exit per NUMA node (to avoid executing on NUMA without CPUs) + numCpusPerNuma.resize(numDetectedCpus); + int const totalCpus = numa_num_configured_cpus(); + for (int i = 0; i < totalCpus; i++) + numCpusPerNuma[numa_node_of_cpu(i)]++; + + // Check for deprecated env vars + if (getenv("USE_HIP_CALL")) + { + printf("[WARN] USE_HIP_CALL has been deprecated. Please use DMA executor 'D' or set USE_GPU_DMA for P2P-Benchmark preset\n"); + exit(1); + } + + char* enableSdma = getenv("HSA_ENABLE_SDMA"); + if (enableSdma && !strcmp(enableSdma, "0")) + { + printf("[WARN] DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. Copies will fallback to blit kernels\n"); + } + } + + // Display info on the env vars that can be used + static void DisplayUsage() + { + printf("Environment variables:\n"); + printf("======================\n"); + printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n"); + printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); + printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n"); + printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n"); + printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n"); + printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. 
May not be greater than # detected HIP devices\n"); + printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); + printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); + printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); + printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); + printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); + printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); + printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); + printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n"); + printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); + printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n"); + } + + // Helper macro to switch between CSV and terminal output +#define PRINT_EV(NAME, VALUE, DESCRIPTION) \ + printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str()) + +#define PRINT_ES(NAME, VALUE, DESCRIPTION) \ + printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? 
"," : " : ", (DESCRIPTION).c_str()) + + // Display env var settings + void DisplayEnvVars() const + { + if (!outputToCsv) + { + printf("TransferBench v%s\n", TB_VERSION); + printf("=====================================================\n"); + printf("[Common]\n"); + } + else + printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION); + + PRINT_EV("BLOCK_BYTES", blockBytes, + std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy")); + PRINT_EV("BYTE_OFFSET", byteOffset, + std::string("Using byte offset of " + std::to_string(byteOffset))); + PRINT_EV("CONTINUE_ON_ERROR", continueOnError, + std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error")); + PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, + (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString())); + PRINT_EV("GPU_KERNEL", gpuKernel, + std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]"); + PRINT_EV("NUM_CPU_DEVICES", numCpuDevices, + std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices"); + PRINT_EV("NUM_GPU_DEVICES", numGpuDevices, + std::string("Using ") + std::to_string(numGpuDevices) + " GPU devices"); + PRINT_EV("NUM_ITERATIONS", numIterations, + std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " " + + (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test")); + PRINT_EV("NUM_WARMUPS", numWarmups, + std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test")); + PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes, + std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock")); + PRINT_EV("USE_INTERACTIVE", useInteractive, + std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode"); + PRINT_EV("USE_PCIE_INDEX", usePcieIndexing, + std::string("Use ") + (usePcieIndexing ? 
"PCIe" : "HIP") + " GPU device indexing"); + PRINT_EV("USE_PREP_KERNEL", usePrepSrcKernel, + std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data"); + PRINT_EV("USE_SINGLE_STREAM", useSingleStream, + std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer")); + PRINT_EV("VALIDATE_DIRECT", validateDirect, + std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer")); + printf("\n"); + }; + + // Display env var for P2P Benchmark preset + void DisplayP2PBenchmarkEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[P2P Related]\n"); + + PRINT_EV("NUM_CPU_SE", numCpuSubExecs, + std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors"); + PRINT_EV("NUM_GPU_SE", numGpuSubExecs, + std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors"); + PRINT_EV("USE_GPU_DMA", useDmaCopy, + std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor"); + PRINT_EV("USE_REMOTE_READ", useRemoteRead, + std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor"); + printf("\n"); + } + + // Display env var settings + void DisplaySweepEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[Sweep Related]\n"); + PRINT_ES("SWEEP_DST", sweepDst.c_str(), + std::string("Destination Memory Types to sweep")); + PRINT_ES("SWEEP_EXE", sweepExe.c_str(), + std::string("Executor Types to sweep")); + PRINT_EV("SWEEP_MAX", sweepMax, + std::string("Max simultaneous transfers (0 = no limit)")); + PRINT_EV("SWEEP_MIN", sweepMin, + std::string("Min simultaenous transfers")); + PRINT_EV("SWEEP_RAND_BYTES", sweepRandBytes, + std::string("Using ") + (sweepRandBytes ? 
"random" : "constant") + " number of bytes per Transfer"); + PRINT_EV("SWEEP_SEED", sweepSeed, + std::string("Random seed set to ") + std::to_string(sweepSeed)); + PRINT_ES("SWEEP_SRC", sweepSrc.c_str(), + std::string("Source Memory Types to sweep")); + PRINT_EV("SWEEP_TEST_LIMIT", sweepTestLimit, + std::string("Max number of tests to run during sweep (0 = no limit)")); + PRINT_EV("SWEEP_TIME_LIMIT", sweepTimeLimit, + std::string("Max number of seconds to run sweep for (0 = no limit)")); + PRINT_EV("SWEEP_XGMI_MAX", sweepXgmiMax, + std::string("Max number of XGMI hops for Transfers (-1 = no limit)")); + PRINT_EV("SWEEP_XGMI_MIN", sweepXgmiMin, + std::string("Min number of XGMI hops for Transfers")); + printf("\n"); + } + + // Helper function that gets parses environment variable or sets to default value + static int GetEnvVar(std::string const& varname, int defaultValue) + { + if (getenv(varname.c_str())) + return atoi(getenv(varname.c_str())); + return defaultValue; + } + + static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue) + { + if (getenv(varname.c_str())) + return getenv(varname.c_str()); + return defaultValue; + } +}; + +#endif diff --git a/transferbench.so/include/GetClosestNumaNode.hpp b/transferbench.so/include/GetClosestNumaNode.hpp new file mode 100644 index 00000000..266f1eb1 --- /dev/null +++ b/transferbench.so/include/GetClosestNumaNode.hpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +// Helper macro for checking HSA calls +#define HSA_CHECK(cmd) \ + do { \ + hsa_status_t error = (cmd); \ + if (error != HSA_STATUS_SUCCESS) { \ + const char* errString = NULL; \ + hsa_status_string(error, &errString); \ + std::cerr << "Encountered HSA error (" << errString << ") at line " \ + << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while (0) + +// Structure to hold HSA agent information +#if !defined(__NVCC__) +struct AgentData +{ + bool isInitialized; + std::vector cpuAgents; + std::vector gpuAgents; + std::vector closestNumaNode; +}; + +// Simple callback function to return any memory pool for an agent +hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data) +{ + hsa_amd_memory_pool_t* poolData = reinterpret_cast(data); + + // Check memory pool flags + uint32_t poolFlags; + HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &poolFlags)); + + // Only consider coarse-grained pools + if (!(poolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) return HSA_STATUS_SUCCESS; + + *poolData = pool; + return HSA_STATUS_SUCCESS; +} + +// Callback function to gather HSA agent information +hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data) +{ + AgentData* agentData = reinterpret_cast(data); + + // Get the device type + hsa_device_type_t deviceType; + HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType)); + if (deviceType == HSA_DEVICE_TYPE_CPU) + agentData->cpuAgents.push_back(agent); + if (deviceType == HSA_DEVICE_TYPE_GPU) + { + agentData->gpuAgents.push_back(agent); + agentData->closestNumaNode.push_back(0); + } + + return HSA_STATUS_SUCCESS; +} + +AgentData& GetAgentData() +{ + static AgentData agentData = {}; + + if (!agentData.isInitialized) + { + agentData.isInitialized = true; + + // Add all detected agents to the list + HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData)); + + // Loop over each GPU + for (uint32_t i = 0; i < 
agentData.gpuAgents.size(); i++) + { + // Collect memory pool + hsa_amd_memory_pool_t pool; + HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool)); + + // Loop over each CPU agent and check distance + int bestDistance = -1; + for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) + { + // Determine number of hops from GPU memory pool to CPU agent + uint32_t hops = 0; + HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, + &hops)); + // Gather link info + hsa_amd_memory_pool_link_info_t* link_info = + (hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t)); + HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, + link_info)); + int numaDist = 0; + for (int k = 0; k < hops; k++) + { + numaDist += link_info[k].numa_distance; + } + if (bestDistance == -1 || numaDist < bestDistance) + { + agentData.closestNumaNode[i] = j; + bestDistance = numaDist; + } + free(link_info); + } + } + } + return agentData; +} +#endif + +// Returns closest CPU NUMA node to provided GPU +// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing +int GetClosestNumaNode(int gpuIdx) +{ +#if defined(__NVCC__) + return -1; +#else + AgentData& agentData = GetAgentData(); + if (gpuIdx < 0 || gpuIdx >= agentData.closestNumaNode.size()) + { + printf("[ERROR] GPU index out is out of bounds\n"); + exit(1); + } + return agentData.closestNumaNode[gpuIdx]; +#endif +} diff --git a/transferbench.so/include/Kernels.hpp b/transferbench.so/include/Kernels.hpp new file mode 100644 index 00000000..dcb6bf07 --- /dev/null +++ b/transferbench.so/include/Kernels.hpp @@ -0,0 +1,409 @@ +/* +Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +#define PackedFloat_t float4 +#define WARP_SIZE 64 +#define BLOCKSIZE 256 +#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float)) +#define MEMSET_CHAR 75 +#define MEMSET_VAL 13323083.0f + +// Each subExecutor is provided with subarrays to work on +#define MAX_SRCS 16 +#define MAX_DSTS 16 +struct SubExecParam +{ + size_t N; // Number of floats this subExecutor works on + int numSrcs; // Number of source arrays + int numDsts; // Number of destination arrays + float* src[MAX_SRCS]; // Source array pointers + float* dst[MAX_DSTS]; // Destination array pointers + long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor) + long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor) +}; + +void CpuReduceKernel(SubExecParam const& p) +{ + int const& numSrcs = p.numSrcs; + int const& numDsts = p.numDsts; + + if (numSrcs == 0) + { + for (int i = 0; i < numDsts; ++i) + memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float)); + } + else if (numSrcs == 1) + { + float const* __restrict__ src = p.src[0]; + for (int i = 0; i < numDsts; ++i) + { + memcpy(p.dst[i], src, p.N * sizeof(float)); + } + } + else + { + for (int j = 0; j < p.N; j++) + { + float sum = p.src[0][j]; + for (int i = 1; i < numSrcs; i++) sum += p.src[i][j]; + for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum; + } + } +} + +std::string PrepSrcValueString() +{ + return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)"; +} + +__host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx) +{ + return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1); +} + +// GPU kernel to prepare src buffer data +__global__ void +PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx) +{ + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < N; + idx += blockDim.x * gridDim.x) + { + ptr[idx] = PrepSrcValue(srcBufferIdx, idx); + } +} + +// Helper function for memset +template __device__ __forceinline__ T MemsetVal(); +template 
<> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; }; +template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); } + +// GPU copy kernel 0: 3 loops: unroll float 4, float4s, floats +template +__global__ void __launch_bounds__(BLOCKSIZE) +GpuReduceKernel(SubExecParam* params) +{ + int64_t startCycle = wall_clock64(); + + // Operate on wavefront granularity + SubExecParam& p = params[blockIdx.x]; + int const numSrcs = p.numSrcs; + int const numDsts = p.numDsts; + int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number + int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront + + // 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration + // Determine the number of packed floats processed by the first loop + size_t Nrem = p.N; + size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE); + size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK; + size_t const loop1Inc = BLOCKSIZE * LOOP1_UNROLL; + size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId; + + while (loop1Offset < loop1Npack) + { + PackedFloat_t vals[LOOP1_UNROLL] = {}; + + if (numSrcs == 0) + { + #pragma unroll + for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal(); + } + else + { + for (int i = 0; i < numSrcs; ++i) + { + PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset; + #pragma unroll + for (int u = 0; u < LOOP1_UNROLL; ++u) + vals[u] += *(packedSrc + u * WARP_SIZE); + } + } + + for (int i = 0; i < numDsts; ++i) + { + PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset; + #pragma unroll + for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u]; + } + loop1Offset += loop1Inc; + } + Nrem -= loop1Nelem; + + if (Nrem > 0) + { + // 2nd loop - Each thread operates on FLOATS_PER_PACK per 
iteration + // NOTE: Using int32_t due to smaller size requirements + int32_t const loop2Npack = Nrem / FLOATS_PER_PACK; + int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK; + int32_t const loop2Inc = BLOCKSIZE; + int32_t loop2Offset = threadIdx.x; + + while (loop2Offset < loop2Npack) + { + PackedFloat_t val; + if (numSrcs == 0) + { + val = MemsetVal(); + } + else + { + val = {}; + for (int i = 0; i < numSrcs; ++i) + { + PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset; + val += *packedSrc; + } + } + + for (int i = 0; i < numDsts; ++i) + { + PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset; + *packedDst = val; + } + loop2Offset += loop2Inc; + } + Nrem -= loop2Nelem; + + // Deal with leftovers less than FLOATS_PER_PACK) + if (threadIdx.x < Nrem) + { + int offset = loop1Nelem + loop2Nelem + threadIdx.x; + float val = 0; + if (numSrcs == 0) + { + val = MEMSET_VAL; + } + else + { + for (int i = 0; i < numSrcs; ++i) + val += p.src[i][offset]; + } + + for (int i = 0; i < numDsts; ++i) + p.dst[i][offset] = val; + } + } + + __syncthreads(); + if (threadIdx.x == 0) + { + p.startCycle = startCycle; + p.stopCycle = wall_clock64(); + } +} + +template +__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N) +{ + int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread + size_t constexpr loopPackInc = BLOCKSIZE * UNROLL_FACTOR; + size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR; + int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number + int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront + int const numSrcs = p.numSrcs; + int const numDsts = p.numDsts; + size_t const numPacksDone = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? 
N : (N / (FLOATS_PER_PACK * numPacksPerWave)) * numPacksPerWave; + size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack; + size_t loopPackOffset = waveId * numPacksPerWave + threadId; + + while (loopPackOffset < numPacksDone) + { + FLOAT_TYPE vals[UNROLL_FACTOR]; + + if (numSrcs == 0) + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal(); + } + else + { + FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset; + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + vals[u] = *(src0Ptr + u * WARP_SIZE); + + for (int i = 1; i < numSrcs; ++i) + { + FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset; + + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + vals[u] += *(srcPtr + u * WARP_SIZE); + } + } + + for (int i = 0; i < numDsts; ++i) + { + FLOAT_TYPE* __restrict__ dstPtr = (FLOAT_TYPE*)(p.dst[i + offset]) + loopPackOffset; + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + *(dstPtr + u * WARP_SIZE) = vals[u]; + } + loopPackOffset += loopPackInc; + } + + return numFloatsLeft; +} + +template +__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N) +{ + // Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop + int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float); + size_t constexpr numFloatsPerInnerLoop = BLOCKSIZE * numFloatsPerRead; + size_t constexpr numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR; + size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 
0 : N % numFloatsPerOuterLoop; + size_t const numFloatsDone = N - numFloatsLeft; + int const numSrcs = p.numSrcs; + int const numDsts = p.numDsts; + + for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop) + { + FLOAT_TYPE tmp[UNROLL_FACTOR]; + + if (numSrcs == 0) + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + tmp[u] = MemsetVal(); + } + else + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop])); + + for (int i = 1; i < numSrcs; ++i) + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop])); + } + } + + for (int i = 0; i < numDsts; ++i) + { + for (int u = 0; u < UNROLL_FACTOR; ++u) + { + *((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u]; + } + } + } + return numFloatsLeft; +} + +template +__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll) +{ + switch (unroll) + { + case 1: return GpuReduceFuncImpl(p, offset, N); + case 2: return GpuReduceFuncImpl(p, offset, N); + case 3: return GpuReduceFuncImpl(p, offset, N); + case 4: return GpuReduceFuncImpl(p, offset, N); + case 5: return GpuReduceFuncImpl(p, offset, N); + case 6: return GpuReduceFuncImpl(p, offset, N); + case 7: return GpuReduceFuncImpl(p, offset, N); + case 8: return GpuReduceFuncImpl(p, offset, N); + case 9: return GpuReduceFuncImpl(p, offset, N); + case 10: return GpuReduceFuncImpl(p, offset, N); + case 11: return GpuReduceFuncImpl(p, offset, N); + case 12: return GpuReduceFuncImpl(p, offset, N); + case 13: return GpuReduceFuncImpl(p, offset, N); + case 14: return GpuReduceFuncImpl(p, offset, N); + case 15: return GpuReduceFuncImpl(p, offset, N); + case 16: return GpuReduceFuncImpl(p, offset, N); + default: return GpuReduceFuncImpl(p, offset, 
N); + } +} + +// GPU copy kernel +__global__ void __launch_bounds__(BLOCKSIZE) +GpuReduceKernel2(SubExecParam* params) +{ + int64_t startCycle = wall_clock64(); + SubExecParam& p = params[blockIdx.x]; + + size_t numFloatsLeft = GpuReduceFunc(p, 0, p.N, 8); + if (numFloatsLeft) + numFloatsLeft = GpuReduceFunc(p, p.N - numFloatsLeft, numFloatsLeft, 1); + + if (numFloatsLeft) + GpuReduceFunc(p, p.N - numFloatsLeft, numFloatsLeft, 1); + + __threadfence_system(); + if (threadIdx.x == 0) + { + p.startCycle = startCycle; + p.stopCycle = wall_clock64(); + } +} + +#define NUM_GPU_KERNELS 18 +typedef void (*GpuKernelFuncPtr)(SubExecParam*); + +GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] = +{ + GpuReduceKernel<8>, + GpuReduceKernel<1>, + GpuReduceKernel<2>, + GpuReduceKernel<3>, + GpuReduceKernel<4>, + GpuReduceKernel<5>, + GpuReduceKernel<6>, + GpuReduceKernel<7>, + GpuReduceKernel<8>, + GpuReduceKernel<9>, + GpuReduceKernel<10>, + GpuReduceKernel<11>, + GpuReduceKernel<12>, + GpuReduceKernel<13>, + GpuReduceKernel<14>, + GpuReduceKernel<15>, + GpuReduceKernel<16>, + GpuReduceKernel2 +}; + +std::string GpuKernelNames[NUM_GPU_KERNELS] = +{ + "Default - 8xUnroll", + "Unroll x1", + "Unroll x2", + "Unroll x3", + "Unroll x4", + "Unroll x5", + "Unroll x6", + "Unroll x7", + "Unroll x8", + "Unroll x9", + "Unroll x10", + "Unroll x11", + "Unroll x12", + "Unroll x13", + "Unroll x14", + "Unroll x15", + "Unroll x16", + "8xUnrollB", +}; diff --git a/transferbench.so/include/TransferBench.hpp b/transferbench.so/include/TransferBench.hpp new file mode 100644 index 00000000..6abbbc5b --- /dev/null +++ b/transferbench.so/include/TransferBench.hpp @@ -0,0 +1,198 @@ +/* +Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Compatibility.hpp" + +// Helper macro for catching HIP errors +#define HIP_CALL(cmd) \ + do { \ + hipError_t error = (cmd); \ + if (error != hipSuccess) \ + { \ + std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \ + << ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while (0) + +#include "EnvVars.hpp" + +// Simple configuration parameters +size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer + +// Different src/dst memory types supported +typedef enum +{ + MEM_CPU = 0, // Coarse-grained pinned CPU memory + MEM_GPU = 1, // Coarse-grained global GPU memory + MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory + MEM_GPU_FINE = 3, // Fine-grained global GPU memory + MEM_CPU_UNPINNED = 4, // Unpinned CPU memory + MEM_NULL = 5, // NULL memory - used for empty +} MemType; + +typedef enum +{ + EXE_CPU = 0, // CPU executor (subExecutor = CPU thread) + EXE_GPU_GFX = 1, // GPU kernel-based executor (subExecutor = threadblock/CU) + EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams) +} ExeType; + +bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE); } +bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); }; +bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); }; +bool IsCpuType(ExeType e) { return (e == EXE_CPU); }; + +char const MemTypeStr[7] = "CGBFUN"; +char const ExeTypeStr[4] = "CGD"; +char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"}; + +MemType inline CharToMemType(char const c) +{ + char const* val = strchr(MemTypeStr, toupper(c)); + if (*val) return (MemType)(val - MemTypeStr); + printf("[ERROR] Unexpected memory type (%c)\n", c); + exit(1); +} + +ExeType inline CharToExeType(char const c) +{ + char const* val = strchr(ExeTypeStr, 
toupper(c)); + if (*val) return (ExeType)(val - ExeTypeStr); + printf("[ERROR] Unexpected executor type (%c)\n", c); + exit(1); +} + +// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified) +// then writes the summation to each of the specified destination memory location(s) +struct Transfer +{ + int transferIndex; // Transfer identifier (within a Test) + ExeType exeType; // Transfer executor type + int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU) + int numSubExecs; // Number of subExecutors to use for this Transfer + size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default) + size_t numBytesActual; // Actual number of bytes to copy + double transferTime; // Time taken in milliseconds + + int numSrcs; // Number of sources + std::vector srcType; // Source memory types + std::vector srcIndex; // Source device indice + std::vector srcMem; // Source memory + + int numDsts; // Number of destinations + std::vector dstType; // Destination memory type + std::vector dstIndex; // Destination device index + std::vector dstMem; // Destination memory + + std::vector subExecParam; // Defines subarrays assigned to each threadblock + SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam + + // Prepares src/dst subarray pointers for each SubExecutor + void PrepareSubExecParams(EnvVars const& ev); + + // Prepare source arrays with input data + bool PrepareSrc(EnvVars const& ev); + + // Validate that destination data contains expected results + void ValidateDst(EnvVars const& ev); + + // Prepare reference buffers + void PrepareReference(EnvVars const& ev, std::vector& buffer, int bufferIdx); + + // String representation functions + std::string SrcToStr() const; + std::string DstToStr() const; +}; + +struct ExecutorInfo +{ + std::vector transfers; // Transfers to execute + size_t totalBytes; // Total bytes this executor transfers + int totalSubExecs; // Total 
number of subExecutors to use + + // For GPU-Executors + SubExecParam* subExecParamGpu; // GPU copy of subExecutor parameters + std::vector streams; + std::vector startEvents; + std::vector stopEvents; + + // Results + double totalTime; +}; + +typedef std::pair Executor; +typedef std::map TransferMap; + +// Display usage instructions +void DisplayUsage(char const* cmdName); + +// Display detected GPU topology / CPU numa nodes +void DisplayTopology(bool const outputToCsv); + +// Build array of test sizes based on sampling factor +void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, + std::vector& valuesofN); + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, + std::vector& memType, std::vector& memIndex); +void ParseExeType(std::string const& token, int const numCpus, int const numGpus, + ExeType& exeType, int& exeIndex); + +void ParseTransfers(char* line, int numCpus, int numGpus, + std::vector& transfers); + +void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N, + std::vector& transfers, bool verbose = true); + +void EnablePeerAccess(int const deviceId, int const peerDeviceId); +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr); +void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0); +void CheckPages(char* byteArray, size_t numBytes, int targetId); +void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx); +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N); +void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom); + +// Return the maximum bandwidth measured for given (src/dst) pair +double GetPeakBandwidth(EnvVars const& ev, size_t const N, + int const isBidirectional, + MemType const srcType, int const srcIndex, + MemType const dstType, int const dstIndex); + +std::string 
GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); + +int RemappedIndex(int const origIdx, bool const isCpuType); +int GetWallClockRate(int deviceId); +void LogTransfers(FILE *fp, int const testNum, std::vector const& transfers); +std::string PtrVectorToStr(std::vector const& strVector, int const initOffset); diff --git a/transferbench.so/include/action.h b/transferbench.so/include/action.h new file mode 100644 index 00000000..4c2518a5 --- /dev/null +++ b/transferbench.so/include/action.h @@ -0,0 +1,86 @@ +/******************************************************************************** + * + * Copyright (c) 2018 ROCm Developer Tools + * + * MIT LICENSE: + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef HIPTEST_SO_INCLUDE_ACTION_H_ +#define HIPTEST_SO_INCLUDE_ACTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif +#include +#ifdef __cplusplus +} +#endif + +#include +#include +#include + +#include "include/rvsactionbase.h" + +using std::vector; +using std::string; +using std::map; + +/** + * @class tfb_action + * @ingroup GST + * + * @brief GST action implementation class + * + * Derives from rvs::actionbase and implements actual action functionality + * in its run() method. + * + */ +class tfb_action: public rvs::actionbase { + public: + tfb_action(); + virtual ~tfb_action(); + + virtual int run(void); + + std::string m_transfer_type; + + protected: + //! TRUE if JSON output is required + bool bjson; + + bool get_all_tfb_config_keys(void); + /** + * @brief reads all common configuration keys from + * the module's properties collection + * @return true if no fatal error occured, false otherwise + */ + bool get_all_common_config_keys(void); + + int get_num_amd_gpu_devices(void); + bool start_tfb_runners(); + /** + * @brief gets the number of ROCm compatible AMD GPUs + * @return run number of GPUs + */ + int run_hip_tests(); +}; + +#endif // HIPTEST_SO_INCLUDE_ACTION_H_ diff --git a/transferbench.so/include/rvs_module.h b/transferbench.so/include/rvs_module.h new file mode 100644 index 00000000..9edaec65 --- /dev/null +++ b/transferbench.so/include/rvs_module.h @@ -0,0 +1,31 @@ +/******************************************************************************** + * + * Copyright (c) 2018 ROCm Developer Tools + * + * MIT LICENSE: + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and 
to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef HIPTEST_SO_INCLUDE_RVS_MODULE_H_ +#define HIPTEST_SO_INCLUDE_RVS_MODULE_H_ + +#include "include/rvsliblog.h" + + +#endif // HIPTEST_SO_INCLUDE_RVS_MODULE_H_ diff --git a/transferbench.so/include/tfb_worker.h b/transferbench.so/include/tfb_worker.h new file mode 100644 index 00000000..6eeda99a --- /dev/null +++ b/transferbench.so/include/tfb_worker.h @@ -0,0 +1,74 @@ +/******************************************************************************** + * + * Copyright (c) 2018 ROCm Developer Tools + * + * MIT LICENSE: + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTEST_SO_INCLUDE_HIPTEST_WORKER_H_
+#define HIPTEST_SO_INCLUDE_HIPTEST_WORKER_H_
+
+#include <string>  /* NOTE(review): include targets were lost in transit -- confirm */
+#include <memory>
+#include "include/rvsthreadbase.h"
+#include "include/rvs_blas.h"
+#include "include/rvs_util.h"
+
+#define GST_RESULT_PASS_MESSAGE "true"
+#define GST_RESULT_FAIL_MESSAGE "false"
+
+
+/**
+ * @class tfbWorker
+ * @ingroup GST
+ *
+ * @brief tfbWorker action implementation class
+ *
+ * Derives from rvs::ThreadBase and implements actual action functionality
+ * in its run() method.
+ *
+ * NOTE(review): src/hiptest_worker.cpp implements a separate `hipTestWorker`
+ * class and includes "include/hiptest_worker.h", which this patch does not
+ * add -- one of the two workers is likely meant to be removed; confirm.
+ */
+class tfbWorker : public rvs::ThreadBase {
+ public:
+  tfbWorker();
+  virtual ~tfbWorker();
+
+  //! sets action name
+  void set_name(const std::string& name) { action_name = name; }
+  //! returns action name
+  const std::string& get_name(void) { return action_name; }
+
+  //! sets test path
+  void set_path(std::string pathname) { m_test_path = pathname; }
+  //! sets the argument string handed to the test executable
+  void set_args(std::string args) { m_test_args = args; }
+
+  //! returns test path
+  const std::string& get_path(void) { return m_test_path; }
+  //! forks/execs the test; declared here, defined in the worker source
+  bool start_hip_tests(int &error, std::string &errdesc);
+ protected:
+  //! thread entry point
+  virtual void run(void);
+ protected:
+  //! name of the action
+  std::string action_name;
+  //! path to execute test
+  std::string m_test_path;
+  //! arguments passed to the test executable
+  std::string m_test_args;
+};
+
+#endif  // HIPTEST_SO_INCLUDE_HIPTEST_WORKER_H_
diff --git a/transferbench.so/src/.gitignore b/transferbench.so/src/.gitignore
new file mode 100644
index 00000000..6677c873
--- /dev/null
+++ b/transferbench.so/src/.gitignore
@@ -0,0 +1 @@
+/libmain.cpp
diff --git a/transferbench.so/src/action.cpp b/transferbench.so/src/action.cpp
new file mode 100644
index 00000000..50c721c5
--- /dev/null
+++ b/transferbench.so/src/action.cpp
@@ -0,0 +1,215 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/action.h"
+
+#include <string>
+#include <vector>
+#include <map>
+#include <regex>
+#include <utility>
+#include <algorithm>
+#include <hip/hip_runtime.h>  /* NOTE(review): include targets were lost in transit -- restored from usage, confirm */
+
+#define __HIP_PLATFORM_HCC__
+
+#include "include/rvs_key_def.h"
+#include "include/tfb_worker.h"
+#include "include/rvsactionbase.h"
+#include "include/rvsloglp.h"
+
+using std::string;
+using std::vector;
+using std::map;
+using std::regex;
+
+#define RVS_CONF_TFB_TYPE_KEY "transfer-type"
+#define TFB_DEFAULT_TYPE "p2p"
+#define TFB_NO_COMPATIBLE_GPUS "No AMD compatible GPU found!"
+
+#define FLOATING_POINT_REGEX "^[0-9]*\\.?[0-9]+$"
+
+#define JSON_CREATE_NODE_ERROR "JSON cannot create node"
+
+/**
+ * @brief default class constructor
+ */
+tfb_action::tfb_action() {
+  bjson = false;
+}
+
+/**
+ * @brief class destructor
+ */
+tfb_action::~tfb_action() {
+  property.clear();
+}
+
+/**
+ * @brief runs the transferbench session
+ * @return true if no error occured, false otherwise
+ */
+bool tfb_action::start_tfb_runners() {
+  // one worker sufficient, as test runner
+  // (was `hipTestWorker`, which is not declared by the included tfb_worker.h)
+  tfbWorker worker;
+  worker.set_name(action_name);
+  worker.set_path(m_transfer_type);
+  worker.start();
+  worker.join();
+
+  return rvs::lp::Stopping() ? false : true;
+}
+
+/**
+ * @brief reads all transferbench-related configuration keys from
+ * the module's properties collection
+ * @return true if no fatal error occured, false otherwise
+ */
+bool tfb_action::get_all_tfb_config_keys(void) {
+  string msg;
+  bool bsts = true;
+
+  if (property_get(RVS_CONF_TFB_TYPE_KEY, &m_transfer_type,
+                   TFB_DEFAULT_TYPE)) {
+    // was RVS_CONF_GST_OPS_TYPE, which is not defined in this module
+    msg = "invalid '" +
+        std::string(RVS_CONF_TFB_TYPE_KEY) + "' key value";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    bsts = false;
+  }
+
+  return bsts;
+}
+
+/**
+ * @brief reads all common configuration keys from
+ * the module's properties collection
+ * (a truncated duplicate definition of this function was removed)
+ * @return true if no fatal error occured, false otherwise
+ */
+bool tfb_action::get_all_common_config_keys(void) {
+  string msg, sdevid, sdev;
+  bool bsts = true;
+
+  // place holder for later
+
+  return bsts;
+}
+
+/**
+ * @brief gets the number of ROCm compatible AMD GPUs
+ * @return number of GPUs, 0 when none found, -1 on JSON logging error
+ */
+int tfb_action::get_num_amd_gpu_devices(void) {
+  int hip_num_gpu_devices;
+  string msg;
+
+  hipGetDeviceCount(&hip_num_gpu_devices);
+  if (hip_num_gpu_devices == 0) {  // no AMD compatible GPU
+    msg = action_name + " " + MODULE_NAME + " " + TFB_NO_COMPATIBLE_GPUS;
+    rvs::lp::Log(msg, rvs::logerror);
+
+    if (bjson) {
+      unsigned int sec;
+      unsigned int usec;
+      rvs::lp::get_ticks(&sec, &usec);
+      void *json_root_node = rvs::lp::LogRecordCreate(MODULE_NAME,
+          action_name.c_str(), rvs::loginfo, sec, usec, true);
+      if (!json_root_node) {
+        // log the error
+        string msg = std::string(JSON_CREATE_NODE_ERROR);
+        rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+        return -1;
+      }
+
+      // was HIPTEST_NO_COMPATIBLE_GPUS, which is not defined in this module
+      rvs::lp::AddString(json_root_node, "ERROR", TFB_NO_COMPATIBLE_GPUS);
+      rvs::lp::LogRecordFlush(json_root_node, rvs::loginfo);
+    }
+    return 0;
+  }
+  return hip_num_gpu_devices;
+}
+
+/**
+ * @brief gets all selected GPUs and starts the worker threads
+ * @return run result
+ */
+int tfb_action::run_transferbench(void) {
+  int hip_num_gpu_devices;
+  std::string msg;
+
+  hip_num_gpu_devices = get_num_amd_gpu_devices();
+
+  if (hip_num_gpu_devices > 0) {
+    if (start_tfb_runners())
+      return 0;
+
+    return -1;
+  } else {
+    msg = "No devices match criteria from the test configuration.";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    return -1;
+  }
+
+  return 0;
+}
+
+/**
+ * @brief runs the whole transferbench logic
+ * @return run result
+ */
+int tfb_action::run(void) {
+  string msg;
+
+  // get the action name
+  if (property_get(RVS_CONF_NAME_KEY, &action_name)) {
+    rvs::lp::Err("Action name missing", MODULE_NAME_CAPS);
+    return -1;
+  }
+
+  // check for -j flag (json logging)
+  if (property.find("cli.-j") != property.end())
+    bjson = true;
+  if (!get_all_common_config_keys())
+    return -1;
+  if (!get_all_tfb_config_keys())
+    return -1;
+
+  return run_transferbench();
+}
diff --git a/transferbench.so/src/hiptest_worker.cpp b/transferbench.so/src/hiptest_worker.cpp
new file mode 100644
index 00000000..9a31fc32
--- /dev/null
+++ b/transferbench.so/src/hiptest_worker.cpp
@@ -0,0 +1,96 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice
and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/hiptest_worker.h"
+
+#include <unistd.h>     /* fork / execl -- NOTE(review): include targets were lost in transit, restored from usage */
+#include <sys/types.h>
+#include <sys/wait.h>   /* waitpid / WIFEXITED / WEXITSTATUS */
+#include <cerrno>
+#include "include/rvs_blas.h"
+#include "include/rvs_module.h"
+#include "include/rvsloglp.h"
+#include "include/rvs_util.h"
+
+#include <string>
+#include <memory>
+#include <map>
+#include <vector>
+
+#define MODULE_NAME "hiptest"
+
+using std::string;
+
+//bool hipTestWorker::bjson = false;
+
+hipTestWorker::hipTestWorker() {}
+hipTestWorker::~hipTestWorker() {}
+
+
+/**
+ * @brief thread entry point: launches the test binary and logs the result
+ */
+void hipTestWorker::run() {
+  string msg, err_description;
+  int error = 0;
+
+
+  // log the test start message
+  msg = "[" + action_name + "] " + MODULE_NAME + " " +
+        " Starting the Hip test ";
+  rvs::lp::Log(msg, rvs::logtrace);
+
+  // fork/exec the test binary and wait for its result
+  // (the return value was previously stored but never checked)
+  bool hipsuccess = start_hip_tests(error, err_description);
+
+  // the test could not be spawned or did not succeed
+  if (error || !hipsuccess) {
+    string msg = "[" + action_name + "] " + MODULE_NAME + " " +
+                 err_description;
+    rvs::lp::Log(msg, rvs::logerror);
+    return;
+  }
+
+}
+
+/**
+ * @brief forks and execs the test binary, then waits for it to finish
+ * @param error set to errno when the child process could not be spawned
+ * @param errdesc human-readable description of the failure
+ * @return true if the child exited normally with status 0, false otherwise
+ */
+bool hipTestWorker::start_hip_tests(int &error, string &errdesc) {
+  int pid, status;
+  auto found = m_test_path.find_last_of('/');
+  auto fname = m_test_path.substr(found + 1);
+
+  error = 0;
+  if ((pid = fork()) == 0) {  // child
+    // the execl argument list must be terminated by a null pointer,
+    // not by the integer literal 0
+    execl(m_test_path.c_str(), fname.c_str(), m_test_args.c_str(),
+          static_cast<char *>(NULL));
+    _exit(127);  // only reached when execl itself fails
+  } else if (pid < 0) {  // fork failed
+    error = errno;
+    errdesc = "unable to fork the test runner process";
+    return false;
+  } else {  // parent
+    waitpid(pid, &status, 0);
+    // previously any normal exit counted as success; require exit status 0
+    if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+      return true;
+    }
+    errdesc = "test exited abnormally or with a non-zero status";
+    return false;
+  }
+}
diff --git a/transferbench.so/src/rvs_module.cpp b/transferbench.so/src/rvs_module.cpp
new file mode 100644
index 00000000..3fbec70c
--- /dev/null
+++ b/transferbench.so/src/rvs_module.cpp
@@ -0,0 +1,91 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/rvs_module.h"
+#include "include/action.h"
+#include "include/rvsloglp.h"
+#include "include/gpu_util.h"
+
+/**
+ * @defgroup TFB TransferBench Module
+ *
+ * @brief exposes the TransferBench benchmark as an RVS module
+ *
+ * Implements the C entry points RVS uses to load a module: interface
+ * negotiation, module metadata, initialization, and creation/destruction
+ * of tfb_action instances that run the benchmark.
+ *
+ */
+
+extern "C" int rvs_module_has_interface(int iid) {
+  int sts = 0;
+  switch (iid) {
+  case 0:
+  case 1:
+    sts = 1;
+  }
+  return sts;
+}
+
+extern "C" const char* rvs_module_get_description(void) {
+  // was "...HipTest module", which describes a different module
+  return "ROCm Validation Suite TransferBench module";
+}
+
+extern "C" const char* rvs_module_get_config(void) {
+  // advertise the key this module actually reads (see src/action.cpp)
+  return "transfer-type (string)";
+}
+
+extern "C" const char* rvs_module_get_output(void) {
+  return "pass (bool)";
+}
+
+extern "C" int rvs_module_init(void* pMi) {
+  // NOTE(review): cast target was lost in transit -- confirm const void*
+  rvs::lp::Initialize(static_cast<const void*>(pMi));
+  rvs::gpulist::Initialize();
+  return 0;
+}
+
+extern "C" int rvs_module_terminate(void) {
+  return 0;
+}
+
+extern "C" void* rvs_module_action_create(void) {
+  // was `new hiptest_action`; the class declared in include/action.h
+  // is tfb_action
+  return static_cast<void*>(new tfb_action);
+}
+
+extern "C" int rvs_module_action_destroy(void* pAction) {
+  delete static_cast<tfb_action*>(pAction);
+  return 0;
+}
+
+extern "C" int rvs_module_action_property_set(void* pAction, const char* Key,
+                                              const char* Val) {
+  return static_cast<tfb_action*>(pAction)->property_set(Key, Val);
+}
+
+extern "C" int rvs_module_action_run(void* pAction) {
+  return static_cast<tfb_action*>(pAction)->run();
+}
diff --git a/transferbench.so/src/tfb_worker.cpp b/transferbench.so/src/tfb_worker.cpp
new file mode 100644
index 00000000..75b926d9
--- /dev/null
+++ b/transferbench.so/src/tfb_worker.cpp
@@ -0,0 +1,1953 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices,
Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +// This program measures simultaneous copy performance across multiple GPUs +// on the same node +#include +#include +#include +#include +#include + +#include "TransferBench.hpp" +#include "GetClosestNumaNode.hpp" + + +/** + * @brief performs the stress test on the given GPU + */ +void tfbWorker::run() { + string msg, err_description; + int error = 0; + + + // log GST stress test - start message + msg = "[" + action_name + "] " + MODULE_NAME + " " + + " Starting the Hip test "; + rvs::lp::Log(msg, rvs::logtrace); + + // let the GPU ramp-up and check the result + int sts = TfbRun(error, err_description); + + // GPU was not able to do the processing (HIP/rocBlas error(s) occurred) + if (error) { + string msg = "[" + action_name + "] " + MODULE_NAME + " " + + err_description; + rvs::lp::Log(msg, rvs::logerror); + return; + } + +} + + +int TfbRun(int &err, string &errmsg) +{ + // Check for NUMA library support + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); + exit(1); + } + + // Display usage instructions and detected topology + if (argc <= 1) + { + int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0); + if (!outputToCsv) DisplayUsage(argv[0]); + DisplayTopology(outputToCsv); + exit(0); + } + + // Collect environment variables / display current run configuration + EnvVars ev; + + // Determine number of bytes to run per Transfer + size_t numBytesPerTransfer = argc > 2 ? 
atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER; + if (argc > 2) + { + // Adjust bytes if unit specified + char units = argv[2][strlen(argv[2])-1]; + switch (units) + { + case 'K': case 'k': numBytesPerTransfer *= 1024; break; + case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break; + case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break; + } + } + if (numBytesPerTransfer % 4) + { + printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer); + exit(1); + } + + // Check for preset tests + // - Tests that sweep across possible sets of Transfers + if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep")) + { + int numGpuSubExecs = (argc > 3 ? atoi(argv[3]) : 4); + int numCpuSubExecs = (argc > 4 ? atoi(argv[4]) : 4); + + ev.configMode = CFG_SWEEP; + RunSweepPreset(ev, numBytesPerTransfer, numGpuSubExecs, numCpuSubExecs, !strcmp(argv[1], "rsweep")); + exit(0); + } + // - Tests that benchmark peer-to-peer performance + else if (!strcmp(argv[1], "p2p")) + { + ev.configMode = CFG_P2P; + RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float)); + exit(0); + } + + // Check that Transfer configuration file can be opened + ev.configMode = CFG_FILE; + FILE* fp = fopen(argv[1], "r"); + if (!fp) + { + printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]); + exit(1); + } + + // Print environment variables and CSV header + ev.DisplayEnvVars(); + if (ev.outputToCsv) + { + printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n"); + } + + int testNum = 0; + char line[2048]; + while(fgets(line, 2048, fp)) + { + // Check if line is a comment to be echoed to output (starts with ##) + if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line); + + // Parse set of parallel Transfers to execute + std::vector transfers; + ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers); + if (transfers.empty()) continue; + + // If the number of bytes is specified, 
use it + if (numBytesPerTransfer != 0) + { + size_t N = numBytesPerTransfer / sizeof(float); + ExecuteTransfers(ev, ++testNum, N, transfers); + } + else + { + // Otherwise generate a range of values + for (int N = 256; N <= (1<<27); N *= 2) + { + int delta = std::max(1, N / ev.samplingFactor); + int curr = N; + while (curr < N * 2) + { + ExecuteTransfers(ev, ++testNum, curr, transfers); + curr += delta; + } + } + } + } + fclose(fp); + + return 0; +} + +void ExecuteTransfers(EnvVars const& ev, + int const testNum, + size_t const N, + std::vector& transfers, + bool verbose) +{ + int const initOffset = ev.byteOffset / sizeof(float); + + // Map transfers by executor + TransferMap transferMap; + for (Transfer& transfer : transfers) + { + Executor executor(transfer.exeType, transfer.exeIndex); + ExecutorInfo& executorInfo = transferMap[executor]; + executorInfo.transfers.push_back(&transfer); + } + + // Loop over each executor and prepare sub-executors + std::map transferList; + for (auto& exeInfoPair : transferMap) + { + Executor const& executor = exeInfoPair.first; + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = executor.first; + int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType)); + + exeInfo.totalTime = 0.0; + exeInfo.totalSubExecs = 0; + + // Loop over each transfer this executor is involved in + for (Transfer* transfer : exeInfo.transfers) + { + // Determine how many bytes to copy for this Transfer (use custom if pre-specified) + transfer->numBytesActual = (transfer->numBytes ? 
transfer->numBytes : N * sizeof(float)); + + // Allocate source memory + transfer->srcMem.resize(transfer->numSrcs); + for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc) + { + MemType const& srcType = transfer->srcType[iSrc]; + int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType)); + + // Ensure executing GPU can access source memory + if (IsGpuType(exeType) == MEM_GPU && IsGpuType(srcType) && srcIndex != exeIndex) + EnablePeerAccess(exeIndex, srcIndex); + + AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]); + } + + // Allocate destination memory + transfer->dstMem.resize(transfer->numDsts); + for (int iDst = 0; iDst < transfer->numDsts; ++iDst) + { + MemType const& dstType = transfer->dstType[iDst]; + int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType)); + + // Ensure executing GPU can access destination memory + if (IsGpuType(exeType) == MEM_GPU && IsGpuType(dstType) && dstIndex != exeIndex) + EnablePeerAccess(exeIndex, dstIndex); + + AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]); + } + + exeInfo.totalSubExecs += transfer->numSubExecs; + transferList[transfer->transferIndex] = transfer; + } + + // Prepare additional requirement for GPU-based executors + if (IsGpuType(exeType)) + { + HIP_CALL(hipSetDevice(exeIndex)); + + // Single-stream is only supported for GFX-based executors + int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ? 
exeInfo.transfers.size() : 1; + exeInfo.streams.resize(numStreamsToUse); + exeInfo.startEvents.resize(numStreamsToUse); + exeInfo.stopEvents.resize(numStreamsToUse); + for (int i = 0; i < numStreamsToUse; ++i) + { + HIP_CALL(hipStreamCreate(&exeInfo.streams[i])); + HIP_CALL(hipEventCreate(&exeInfo.startEvents[i])); + HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i])); + } + + if (exeType == EXE_GPU_GFX) + { + // Allocate one contiguous chunk of GPU memory for threadblock parameters + // This allows support for executing one transfer per stream, or all transfers in a single stream + AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam), + (void**)&exeInfo.subExecParamGpu); + } + } + } + + if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum); + + // Prepare input memory and block parameters for current N + bool isSrcCorrect = true; + for (auto& exeInfoPair : transferMap) + { + Executor const& executor = exeInfoPair.first; + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = executor.first; + int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType)); + + exeInfo.totalBytes = 0; + + int transferOffset = 0; + for (int i = 0; i < exeInfo.transfers.size(); ++i) + { + // Prepare subarrays each threadblock works on and fill src memory with patterned data + Transfer* transfer = exeInfo.transfers[i]; + transfer->PrepareSubExecParams(ev); + isSrcCorrect &= transfer->PrepareSrc(ev); + exeInfo.totalBytes += transfer->numBytesActual; + + // Copy block parameters to GPU for GPU executors + if (transfer->exeType == EXE_GPU_GFX) + { + exeInfo.transfers[i]->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset; + HIP_CALL(hipSetDevice(exeIndex)); + HIP_CALL(hipMemcpy(&exeInfo.subExecParamGpu[transferOffset], + transfer->subExecParam.data(), + transfer->subExecParam.size() * sizeof(SubExecParam), + hipMemcpyHostToDevice)); + HIP_CALL(hipDeviceSynchronize()); + + transferOffset += transfer->subExecParam.size(); + 
} + } + } + + // Launch kernels (warmup iterations are not counted) + double totalCpuTime = 0; + size_t numTimedIterations = 0; + std::stack threads; + for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++) + { + if (ev.numIterations > 0 && iteration >= ev.numIterations) break; + if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break; + + // Pause before starting first timed iteration in interactive mode + if (verbose && ev.useInteractive && iteration == 0) + { + printf("Memory prepared:\n"); + + for (Transfer& transfer : transfers) + { + printf("Transfer %03d:\n", transfer.transferIndex); + for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc) + printf(" SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]); + for (int iDst = 0; iDst < transfer.numDsts; ++iDst) + printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]); + } + printf("Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Start CPU timing for this iteration + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Execute all Transfers in parallel + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType exeType = exeInfoPair.first.first; + int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 
1 : exeInfo.transfers.size(); + + for (int i = 0; i < numTransfersToRun; ++i) + threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i)); + } + + // Wait for all threads to finish + int const numTransfers = threads.size(); + for (int i = 0; i < numTransfers; i++) + { + threads.top().join(); + threads.pop(); + } + + // Stop CPU timing for this iteration + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); + + if (iteration >= 0) + { + ++numTimedIterations; + totalCpuTime += deltaSec; + } + } + + // Pause for interactive mode + if (verbose && isSrcCorrect && ev.useInteractive) + { + printf("Transfers complete. Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Validate that each transfer has transferred correctly + size_t totalBytesTransferred = 0; + int const numTransfers = transferList.size(); + for (auto transferPair : transferList) + { + Transfer* transfer = transferPair.second; + transfer->ValidateDst(ev); + totalBytesTransferred += transfer->numBytesActual; + } + + // Report timings + totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000; + double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime; + double maxGpuTime = 0; + + if (!isSrcCorrect) goto cleanup; + if (ev.useSingleStream) + { + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = exeInfoPair.first.second; + + // Compute total time for non GPU executors + if (exeType != EXE_GPU_GFX) + { + exeInfo.totalTime = 0; + for (auto const& transfer : exeInfo.transfers) + exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime); + } + + double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations); + double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / 
exeDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, exeDurationMsec); + + if (verbose && !ev.outputToCsv) + { + printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes\n", + ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes); + } + + int totalCUs = 0; + for (auto const& transfer : exeInfo.transfers) + { + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + totalCUs += transfer->numSubExecs; + + if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, + transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + MemTypeStr[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + + if (verbose && ev.outputToCsv) + { + printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, + MemTypeStr[exeType], exeIndex, totalCUs, + exeBandwidthGbs, exeDurationMsec); + } + } + } + else + { + for (auto const& transferPair : transferList) + { + Transfer* transfer = transferPair.second; + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, transferDurationMsec); 
+ if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + } + + // Display aggregate statistics + if (verbose) + { + if (!ev.outputToCsv) + { + printf(" Aggregate (CPU) | %7.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n", + totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime); + } + else + { + printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime); + } + } + + // Release GPU memory +cleanup: + for (auto exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType)); + + for (auto& transfer : exeInfo.transfers) + { + for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc) + { + MemType const& srcType = transfer->srcType[iSrc]; + DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset); + } + for (int iDst = 0; iDst < transfer->numDsts; ++iDst) + { + MemType const& dstType = transfer->dstType[iDst]; + DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset); + } + 
transfer->subExecParam.clear(); + } + + if (IsGpuType(exeType)) + { + int const numStreams = (int)exeInfo.streams.size(); + for (int i = 0; i < numStreams; ++i) + { + HIP_CALL(hipEventDestroy(exeInfo.startEvents[i])); + HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i])); + HIP_CALL(hipStreamDestroy(exeInfo.streams[i])); + } + + if (exeType == EXE_GPU_GFX) + { + DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu); + } + } + } +} + +void DisplayUsage(char const* cmdName) +{ + printf("TransferBench v%s\n", TB_VERSION); + printf("========================================\n"); + + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); + exit(1); + } + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + int const numCpuDevices = numa_num_configured_nodes(); + + printf("Usage: %s config \n", cmdName); + printf(" config: Either:\n"); + printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n"); + printf(" - Name of preset config:\n"); + printf(" p2p - Peer-to-peer benchmark tests\n"); + printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n"); + printf(" - 3rd/4th optional args for # GPU SubExecs / # CPU SubExecs per Transfer\n"); + printf(" N : (Optional) Number of bytes to copy per Transfer.\n"); + printf(" If not specified, defaults to %lu bytes. 
Must be a multiple of 4 bytes\n", + DEFAULT_BYTES_PER_TRANSFER); + printf(" If 0 is specified, a range of Ns will be benchmarked\n"); + printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n"); + printf("\n"); + + EnvVars::DisplayUsage(); +} + +int RemappedIndex(int const origIdx, bool const isCpuType) +{ + static std::vector remappingCpu; + static std::vector remappingGpu; + + // Build CPU remapping on first use + // Skip numa nodes that are not configured + if (remappingCpu.empty()) + { + for (int node = 0; node <= numa_max_node(); node++) + if (numa_bitmask_isbitset(numa_get_mems_allowed(), node)) + remappingCpu.push_back(node); + } + + // Build remappingGpu on first use + if (remappingGpu.empty()) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + remappingGpu.resize(numGpuDevices); + + int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0; + if (!usePcieIndexing) + { + // For HIP-based indexing no remappingGpu is necessary + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = i; + } + else + { + // Collect PCIe address for each GPU + std::vector> mapping; + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; ++i) + { + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i)); + mapping.push_back(std::make_pair(pciBusId, i)); + } + // Sort GPUs by PCIe address then use that as mapping + std::sort(mapping.begin(), mapping.end()); + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = mapping[i].second; + } + } + return isCpuType ? 
remappingCpu[origIdx] : remappingGpu[origIdx]; +} + +void DisplayTopology(bool const outputToCsv) +{ + + int numCpuDevices = numa_num_configured_nodes(); + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + + if (outputToCsv) + { + printf("NumCpus,%d\n", numCpuDevices); + printf("NumGpus,%d\n", numGpuDevices); + } + else + { + printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n", + numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices); + } + + // Print out detected CPU topology + if (outputToCsv) + { + printf("NUMA"); + for (int j = 0; j < numCpuDevices; j++) + printf(",NUMA%02d", j); + printf(",# CPUs,ClosestGPUs,ActualNode\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numCpuDevices; j++) + printf("NUMA %02d|", j); + printf(" #Cpus | Closest GPU(s)\n"); + + printf("------------+"); + for (int j = 0; j <= numCpuDevices; j++) + printf("-------+"); + printf("---------------\n"); + } + + for (int i = 0; i < numCpuDevices; i++) + { + int nodeI = RemappedIndex(i, true); + printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? 
"," : "|"); + for (int j = 0; j < numCpuDevices; j++) + { + int nodeJ = RemappedIndex(j, true); + int numaDist = numa_distance(nodeI, nodeJ); + if (outputToCsv) + printf("%d,", numaDist); + else + printf(" %5d |", numaDist); + } + + int numCpus = 0; + for (int j = 0; j < numa_num_configured_cpus(); j++) + if (numa_node_of_cpu(j) == nodeI) numCpus++; + if (outputToCsv) + printf("%d,", numCpus); + else + printf(" %5d | ", numCpus); + +#if !defined(__NVCC__) + bool isFirst = true; + for (int j = 0; j < numGpuDevices; j++) + { + if (GetClosestNumaNode(RemappedIndex(j, false)) == i) + { + if (isFirst) isFirst = false; + else printf(","); + printf("%d", j); + } + } +#endif + printf("\n"); + } + printf("\n"); + +#if defined(__NVCC__) + // No further topology detection done for NVIDIA platforms + return; +#endif + + // Print out detected GPU topology + if (outputToCsv) + { + printf("GPU"); + for (int j = 0; j < numGpuDevices; j++) + printf(",GPU %02d", j); + printf(",PCIe Bus ID,ClosestNUMA\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + { + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, j)); + std::string fullName = prop.gcnArchName; + std::string archName = fullName.substr(0, fullName.find(':')); + printf(" %6s |", archName.c_str()); + } + printf("\n"); + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + printf(" GPU %02d |", j); + printf(" PCIe Bus ID | #CUs | Closest NUMA\n"); + for (int j = 0; j <= numGpuDevices; j++) + printf("--------+"); + printf("--------------+------+-------------\n"); + } + +#if !defined(__NVCC__) + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; i++) + { + int const deviceIdx = RemappedIndex(i, false); + printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? 
"," : " |"); + for (int j = 0; j < numGpuDevices; j++) + { + if (i == j) + { + if (outputToCsv) + printf("-,"); + else + printf(" - |"); + } + else + { + uint32_t linkType, hopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx, + RemappedIndex(j, false), + &linkType, &hopCount)); + printf("%s%s-%d%s", + outputToCsv ? "" : " ", + linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" : + linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" : + linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" : + linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" : + linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????", + hopCount, outputToCsv ? "," : " |"); + } + } + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx)); + + int numDeviceCUs = 0; + HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx)); + + if (outputToCsv) + printf("%s,%d,%d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + else + printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + } +#endif +} + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, + std::vector& memTypes, std::vector& memIndices) +{ + char typeChar; + int offset = 0, devIndex, inc; + bool found = false; + + memTypes.clear(); + memIndices.clear(); + while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2) + { + offset += inc; + MemType memType = CharToMemType(typeChar); + + if (IsCpuType(memType) && (devIndex < 0 || devIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, devIndex); + exit(1); + } + if (IsGpuType(memType) && (devIndex < 0 || devIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, devIndex); + exit(1); + } + + found = true; + if (memType != MEM_NULL) + { + memTypes.push_back(memType); + memIndices.push_back(devIndex); + } + } + if (!found) + { + printf("[ERROR] Unable to 
parse memory type token %s. Expected one of %s followed by an index\n", + token.c_str(), MemTypeStr); + exit(1); + } +} + +void ParseExeType(std::string const& token, int const numCpus, int const numGpus, + ExeType &exeType, int& exeIndex) +{ + char typeChar; + if (sscanf(token.c_str(), " %c%d", &typeChar, &exeIndex) != 2) + { + printf("[ERROR] Unable to parse valid executor token (%s). Exepected one of %s followed by an index\n", + token.c_str(), ExeTypeStr); + exit(1); + } + exeType = CharToExeType(typeChar); + + if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, exeIndex); + exit(1); + } + if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, exeIndex); + exit(1); + } +} + +// Helper function to parse a list of Transfer definitions +void ParseTransfers(char* line, int numCpus, int numGpus, std::vector& transfers) +{ + // Replace any round brackets or '->' with spaces, + for (int i = 1; line[i]; i++) + if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; + + transfers.clear(); + + int numTransfers = 0; + std::istringstream iss(line); + iss >> numTransfers; + if (iss.fail()) return; + + std::string exeMem; + std::string srcMem; + std::string dstMem; + + // If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes) + // otherwise read triples (srcMem, exeMem, dstMem) + bool const advancedMode = (numTransfers < 0); + numTransfers = abs(numTransfers); + + int numSubExecs; + if (!advancedMode) + { + iss >> numSubExecs; + if (numSubExecs <= 0 || iss.fail()) + { + printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numSubExecs); + exit(1); + } + } + + size_t numBytes = 0; + for (int i = 0; i < numTransfers; i++) + { + Transfer transfer; + transfer.transferIndex = i; + transfer.numBytes = 0; + 
transfer.numBytesActual = 0; + if (!advancedMode) + { + iss >> srcMem >> exeMem >> dstMem; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1); + exit(1); + } + } + else + { + std::string numBytesToken; + iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1); + exit(1); + } + if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1) + { + printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1); + exit(1); + } + char units = numBytesToken.back(); + switch (toupper(units)) + { + case 'K': numBytes *= 1024; break; + case 'M': numBytes *= 1024*1024; break; + case 'G': numBytes *= 1024*1024*1024; break; + } + } + + ParseMemType(srcMem, numCpus, numGpus, transfer.srcType, transfer.srcIndex); + ParseMemType(dstMem, numCpus, numGpus, transfer.dstType, transfer.dstIndex); + ParseExeType(exeMem, numCpus, numGpus, transfer.exeType, transfer.exeIndex); + + transfer.numSrcs = (int)transfer.srcType.size(); + transfer.numDsts = (int)transfer.dstType.size(); + if (transfer.numSrcs == 0 && transfer.numDsts == 0) + { + printf("[ERROR] Transfer must have at least one src or dst\n"); + exit(1); + } + + if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1)) + { + printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n"); + exit(1); + } + + transfer.numSubExecs = numSubExecs; + transfer.numBytes = numBytes; + transfers.push_back(transfer); + } +} + +void EnablePeerAccess(int const deviceId, int const peerDeviceId) +{ + int canAccess; + HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId)); + if (!canAccess) + { + printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId); + exit(1); + } + 
HIP_CALL(hipSetDevice(deviceId)); + hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0); + if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) + { + printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n", + deviceId, peerDeviceId, hipGetErrorString(error)); + exit(1); + } +} + +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr) +{ + if (numBytes == 0) + { + printf("[ERROR] Unable to allocate 0 bytes\n"); + exit(1); + } + *memPtr = nullptr; + if (IsCpuType(memType)) + { + // Set numa policy prior to call to hipHostMalloc + numa_set_preferred(devIndex); + + // Allocate host-pinned memory (should respect NUMA mem policy) + if (memType == MEM_CPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n"); + exit(1); +#else + HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); +#endif + } + else if (memType == MEM_CPU) + { +#if defined (__NVCC__) + if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess) +#else + if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess) +#endif + { + printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex); + exit(1); + } + } + else if (memType == MEM_CPU_UNPINNED) + { + *memPtr = numa_alloc_onnode(numBytes, devIndex); + } + + // Check that the allocated pages are actually on the correct NUMA node + memset(*memPtr, 0, numBytes); + CheckPages((char*)*memPtr, numBytes, devIndex); + + // Reset to default numa mem policy + numa_set_preferred(-1); + } + else if (IsGpuType(memType)) + { + if (memType == MEM_GPU) + { + // Allocate GPU memory on appropriate device + HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipMalloc((void**)memPtr, numBytes)); + } + else if (memType == MEM_GPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n"); + 
exit(1); +#else + HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained)); + +#endif + } + HIP_CALL(hipMemset(*memPtr, 0, numBytes)); + HIP_CALL(hipDeviceSynchronize()); + } + else + { + printf("[ERROR] Unsupported memory type %d\n", memType); + exit(1); + } +} + +void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes) +{ + if (memType == MEM_CPU || memType == MEM_CPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null CPU pointer for %lu bytes. Skipping hipHostFree\n", bytes); + return; + } + HIP_CALL(hipHostFree(memPtr)); + } + else if (memType == MEM_CPU_UNPINNED) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes. Skipping numa_free\n", bytes); + return; + } + numa_free(memPtr, bytes); + } + else if (memType == MEM_GPU || memType == MEM_GPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. 
Skipping hipFree\n", bytes); + return; + } + HIP_CALL(hipFree(memPtr)); + } +} + +void CheckPages(char* array, size_t numBytes, int targetId) +{ + unsigned long const pageSize = getpagesize(); + unsigned long const numPages = (numBytes + pageSize - 1) / pageSize; + + std::vector pages(numPages); + std::vector status(numPages); + + pages[0] = array; + for (int i = 1; i < numPages; i++) + { + pages[i] = (char*)pages[i-1] + pageSize; + } + + long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0); + if (retCode) + { + printf("[ERROR] Unable to collect page info\n"); + exit(1); + } + + size_t mistakeCount = 0; + for (int i = 0; i < numPages; i++) + { + if (status[i] < 0) + { + printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i); + exit(1); + } + if (status[i] != targetId) mistakeCount++; + } + if (mistakeCount > 0) + { + printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId); + exit(1); + } +} + +void RunTransfer(EnvVars const& ev, int const iteration, + ExecutorInfo& exeInfo, int const transferIdx) +{ + Transfer* transfer = exeInfo.transfers[transferIdx]; + + if (transfer->exeType == EXE_GPU_GFX) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + // Figure out how many threadblocks to use. + // In single stream mode, all the threadblocks for this GPU are launched + // Otherwise, just launch the threadblocks associated with this single Transfer + int const numBlocksToRun = ev.useSingleStream ? 
exeInfo.totalSubExecs : transfer->numSubExecs; +#if defined(__NVCC__) + HIP_CALL(hipEventRecord(startEvent, stream)); + GpuKernelTable[ev.gpuKernel]<<>>(transfer->subExecParamGpuPtr); + HIP_CALL(hipEventRecord(stopEvent, stream)); +#else + hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel], + dim3(numBlocksToRun, 1, 1), + dim3(BLOCKSIZE, 1, 1), + ev.sharedMemBytes, stream, + startEvent, stopEvent, + 0, transfer->subExecParamGpuPtr); +#endif + // Synchronize per iteration, unless in single sync mode, in which case + // synchronize during last warmup / last actual iteration + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + + if (ev.useSingleStream) + { + // Figure out individual timings for Transfers that were all launched together + for (Transfer* currTransfer : exeInfo.transfers) + { + long long minStartCycle = currTransfer->subExecParamGpuPtr[0].startCycle; + long long maxStopCycle = currTransfer->subExecParamGpuPtr[0].stopCycle; + for (int i = 1; i < currTransfer->numSubExecs; i++) + { + minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle); + maxStopCycle = std::max(maxStopCycle, currTransfer->subExecParamGpuPtr[i].stopCycle); + } + int const wallClockRate = GetWallClockRate(exeIndex); + double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); + currTransfer->transferTime += iterationTimeMs; + } + exeInfo.totalTime += gpuDeltaMsec; + } + else + { + transfer->transferTime += gpuDeltaMsec; + } + } + } + else if (transfer->exeType == EXE_GPU_DMA) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + 
HIP_CALL(hipEventRecord(startEvent, stream)); + if (transfer->numSrcs == 0 && transfer->numDsts == 1) + { + HIP_CALL(hipMemsetAsync(transfer->dstMem[0], + MEMSET_CHAR, transfer->numBytesActual, stream)); + } + else if (transfer->numSrcs == 1 && transfer->numDsts == 1) + { + HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0], + transfer->numBytesActual, hipMemcpyDefault, + stream)); + } + HIP_CALL(hipEventRecord(stopEvent, stream)); + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + transfer->transferTime += gpuDeltaMsec; + } + } + else if (transfer->exeType == EXE_CPU) // CPU execution agent + { + // Force this thread and all child threads onto correct NUMA node + int const exeIndex = RemappedIndex(transfer->exeIndex, true); + if (numa_run_on_node(exeIndex)) + { + printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex); + exit(1); + } + + std::vector childThreads; + + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Launch each subExecutor in child-threads to perform memcopies + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i]))); + + // Wait for child-threads to finish + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads[i].join(); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + + // Record time if not a warmup iteration + if (iteration >= 0) + transfer->transferTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); + } +} + +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) +{ + ev.DisplayP2PBenchmarkEnvVars(); + + // Collect the number of available CPUs/GPUs on this machine + int const numCpus = ev.numCpuDevices; + int const numGpus = ev.numGpuDevices; + int const numDevices = numCpus + numGpus; + + // Enable peer to peer for each GPU + for (int 
i = 0; i < numGpus; i++) + for (int j = 0; j < numGpus; j++) + if (i != j) EnablePeerAccess(i, j); + + // Perform unidirectional / bidirectional + for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) + { + // Print header + if (!ev.outputToCsv) + { + printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni", + ev.useRemoteRead ? "Remote" : "Local", + ev.useRemoteRead ? "Local" : "Remote", + ev.useDmaCopy ? "DMA" : "GFX"); + + if (isBidirectional) + { + printf("%12s", "SRC\\DST"); + } + else + { + if (ev.useRemoteRead) + printf("%12s", "SRC\\EXE+DST"); + else + printf("%12s", "SRC+EXE\\DST"); + } + for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); + for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i); + printf("\n"); + } + + // Loop over all possible src/dst pairs + for (int src = 0; src < numDevices; src++) + { + MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU); + int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus); + + if (!ev.outputToCsv) + printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex); + + for (int dst = 0; dst < numDevices; dst++) + { + MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU); + int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus); + + double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcType, srcIndex, dstType, dstIndex); + if (!ev.outputToCsv) + { + if (bandwidth == 0) + printf("%10s", "N/A"); + else + printf("%10.2f", bandwidth); + } + else + { + printf("%s %02d,%s %02d,%s,%s,%s,%.2f,%lu\n", + srcType == MEM_CPU ? "CPU" : "GPU", srcIndex, + dstType == MEM_CPU ? "CPU" : "GPU", dstIndex, + isBidirectional ? "bidirectional" : "unidirectional", + ev.useRemoteRead ? "Remote" : "Local", + ev.useDmaCopy ? 
"DMA" : "GFX", + bandwidth, + N * sizeof(float)); + } + fflush(stdout); + } + if (!ev.outputToCsv) printf("\n"); + } + if (!ev.outputToCsv) printf("\n"); + } +} + +double GetPeakBandwidth(EnvVars const& ev, size_t const N, + int const isBidirectional, + MemType const srcType, int const srcIndex, + MemType const dstType, int const dstIndex) +{ + // Skip bidirectional on same device + if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f; + + // Prepare Transfers + std::vector transfers(2); + transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float); + + // SRC -> DST + transfers[0].numSrcs = transfers[0].numDsts = 1; + transfers[0].srcType.push_back(srcType); + transfers[0].dstType.push_back(dstType); + transfers[0].srcIndex.push_back(srcIndex); + transfers[0].dstIndex.push_back(dstIndex); + + // DST -> SRC + transfers[1].numSrcs = transfers[1].numDsts = 1; + transfers[1].srcType.push_back(dstType); + transfers[1].dstType.push_back(srcType); + transfers[1].srcIndex.push_back(dstIndex); + transfers[1].dstIndex.push_back(srcIndex); + + // Either perform (local read + remote write), or (remote read + local write) + ExeType gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX; + transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU; + transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU; + transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex); + transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex); + transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs; + transfers[1].numSubExecs = IsGpuType(transfers[0].exeType) ? 
ev.numGpuSubExecs : ev.numCpuSubExecs; + + // Remove (DST->SRC) if not bidirectional + transfers.resize(isBidirectional + 1); + + // Abort if executing on NUMA node with no CPUs + for (int i = 0; i <= isBidirectional; i++) + { + if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0) + return 0; + +#if defined(__NVCC__) + // NVIDIA platform cannot access GPU memory directly from CPU executors + if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType))) + return 0; +#endif + } + + ExecuteTransfers(ev, 0, N, transfers, false); + + // Collect aggregate bandwidth + double totalBandwidth = 0; + for (int i = 0; i <= isBidirectional; i++) + { + double transferDurationMsec = transfers[i].transferTime / (1.0 * ev.numIterations); + double transferBandwidthGbs = (transfers[i].numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + totalBandwidth += transferBandwidthGbs; + } + return totalBandwidth; +} + +void Transfer::PrepareSubExecParams(EnvVars const& ev) +{ + // Each subExecutor needs to know src/dst pointers and how many elements to transfer + // Figure out the sub-array each subExecutor works on for this Transfer + // - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes, + // except the very last one, for alignment reasons + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + int const targetMultiple = ev.blockBytes / sizeof(float); + + // In some cases, there may not be enough data for all subExectors + int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs); + this->subExecParam.clear(); + this->subExecParam.resize(this->numSubExecs); + + size_t assigned = 0; + for (int i = 0; i < this->numSubExecs; ++i) + { + int const subExecLeft = std::max(0, maxSubExecToUse - i); + size_t const leftover = N - assigned; + size_t const roundedN = (leftover + 
targetMultiple - 1) / targetMultiple; + + SubExecParam& p = this->subExecParam[i]; + p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0; + p.numSrcs = this->numSrcs; + p.numDsts = this->numDsts; + for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) + p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset; + for (int iDst = 0; iDst < this->numDsts; ++iDst) + p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset; + + if (ev.enableDebug) + { + printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n", + this->transferIndex, i, p.N, assigned, assigned + p.N); + } + + p.startCycle = 0; + p.stopCycle = 0; + assigned += p.N; + } + + this->transferTime = 0.0; +} + +void Transfer::PrepareReference(EnvVars const& ev, std::vector& buffer, int bufferIdx) +{ + size_t N = buffer.size(); + if (bufferIdx >= 0) + { + size_t patternLen = ev.fillPattern.size(); + if (patternLen > 0) + { + for (size_t i = 0; i < N; ++i) + buffer[i] = ev.fillPattern[i % patternLen]; + } + else + { + for (size_t i = 0; i < N; ++i) + buffer[i] = PrepSrcValue(bufferIdx, i); + } + } + else // Destination buffer + { + if (this->numSrcs == 0) + { + // Note: 0x75757575 = 13323083.0 + memset(buffer.data(), MEMSET_CHAR, N * sizeof(float)); + } + else + { + PrepareReference(ev, buffer, 0); + + if (this->numSrcs > 1) + { + std::vector temp(N); + for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx) + { + PrepareReference(ev, temp, srcIdx); + for (int i = 0; i < N; ++i) + { + buffer[i] += temp[i]; + } + } + } + } + } +} + +bool Transfer::PrepareSrc(EnvVars const& ev) +{ + if (this->numSrcs == 0) return true; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + std::vector reference(N); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float* srcPtr = this->srcMem[srcIdx] + initOffset; + PrepareReference(ev, reference, srcIdx); + + // Initialize source memory array with reference pattern + if 
(IsGpuType(this->srcType[srcIdx])) + { + int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + if (ev.usePrepSrcKernel) + PrepSrcDataKernel<<<32, BLOCKSIZE>>>(srcPtr, N, srcIdx); + else + HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + } + else if (IsCpuType(this->srcType[srcIdx])) + { + memcpy(srcPtr, reference.data(), this->numBytesActual); + } + + // Perform check just to make sure that data has been copied properly + float* srcCheckPtr = srcPtr; + std::vector srcCopy(N); + if (IsGpuType(this->srcType[srcIdx])) + { + if (!ev.validateDirect) + { + HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + srcCheckPtr = srcCopy.data(); + } + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != srcCheckPtr[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx); +#if !defined(__NVCC__) + float const val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]); +#endif + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + return false; + } + } + } + return true; +} + +void Transfer::ValidateDst(EnvVars const& ev) +{ + if (this->numDsts == 0) return; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + 
std::vector reference(N); + PrepareReference(ev, reference, -1); + + std::vector hostBuffer(N); + for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx) + { + float* output; + if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect) + { + output = this->dstMem[dstIdx] + initOffset; + } + else + { + int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + output = hostBuffer.data(); + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != output[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float srcVal; + HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault)); +#if !defined(__NVCC__) + float val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal); +#endif + } + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); +#if !defined(__NVCC__) + float dstVal = this->dstMem[dstIdx][initOffset + i]; + printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal); +#else + printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]); +#endif + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + else + 
break; + } + } + } +} + +std::string Transfer::SrcToStr() const +{ + if (numSrcs == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numSrcs; ++i) + ss << MemTypeStr[srcType[i]] << srcIndex[i]; + return ss.str(); +} + +std::string Transfer::DstToStr() const +{ + if (numDsts == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numDsts; ++i) + ss << MemTypeStr[dstType[i]] << dstIndex[i]; + return ss.str(); +} + +// NOTE: This is a stop-gap solution until HIP provides wallclock values +int GetWallClockRate(int deviceId) +{ + static std::vector wallClockPerDeviceMhz; + + if (wallClockPerDeviceMhz.size() == 0) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + wallClockPerDeviceMhz.resize(numGpuDevices); + + for (int i = 0; i < numGpuDevices; i++) + { +#if defined(__NVCC__) + int value = 1410000; + //HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i)); + //value *= 1000; +#else + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, i)); + int value = 25000; + switch (prop.gcnArch) + { + case 906: case 910: value = 25000; break; + default: + printf("Unrecognized GCN arch %d\n", prop.gcnArch); + } +#endif + wallClockPerDeviceMhz[i] = value; + } + } + return wallClockPerDeviceMhz[deviceId]; +} + +void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom) +{ + ev.DisplaySweepEnvVars(); + + // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets) + std::vector> exeList; + for (auto exe : ev.sweepExe) + { + ExeType const exeType = CharToExeType(exe); + if (IsGpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex) + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + else if (IsCpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex) + { + // Skip NUMA nodes that have no CPUs (e.g. 
CXL) + if (ev.numCpusPerNuma[exeIndex] == 0) continue; + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + } + } + int numExes = exeList.size(); + + std::vector> srcList; + for (auto src : ev.sweepSrc) + { + MemType const srcType = CharToMemType(src); + int const numDevices = IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex) + srcList.push_back(std::make_pair(srcType, srcIndex)); + } + int numSrcs = srcList.size(); + + + std::vector> dstList; + for (auto dst : ev.sweepDst) + { + MemType const dstType = CharToMemType(dst); + int const numDevices = IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex) + dstList.push_back(std::make_pair(dstType, dstIndex)); + } + int numDsts = dstList.size(); + + // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count) + struct TransferInfo + { + MemType srcType; int srcIndex; + ExeType exeType; int exeIndex; + MemType dstType; int dstIndex; + }; + + // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI + bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0); + + std::vector possibleTransfers; + TransferInfo tinfo; + for (int i = 0; i < numExes; ++i) + { + // Skip CPU executors if XGMI link must be used + if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue; + tinfo.exeType = exeList[i].first; + tinfo.exeIndex = exeList[i].second; + + bool isXgmiSrc = false; + int numHopsSrc = 0; + for (int j = 0; j < numSrcs; ++j) + { + if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first)) + { + if (exeList[i].second != srcList[j].second) + { +#if defined(__NVCC__) + isXgmiSrc = false; +#else + uint32_t exeToSrcLinkType, exeToSrcHopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false), + RemappedIndex(srcList[j].second, false), + &exeToSrcLinkType, + 
&exeToSrcHopCount)); + isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI); + if (isXgmiSrc) numHopsSrc = exeToSrcHopCount; +#endif + } + else + { + isXgmiSrc = true; + numHopsSrc = 0; + } + + // Skip this SRC if it is not XGMI but only XGMI links may be used + if (useXgmiOnly && !isXgmiSrc) continue; + + // Skip this SRC if XGMI distance is already past limit + if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue; + } + else if (useXgmiOnly) continue; + + tinfo.srcType = srcList[j].first; + tinfo.srcIndex = srcList[j].second; + + bool isXgmiDst = false; + int numHopsDst = 0; + for (int k = 0; k < numDsts; ++k) + { + if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first)) + { + if (exeList[i].second != dstList[k].second) + { +#if defined(__NVCC__) + isXgmiSrc = false; +#else + uint32_t exeToDstLinkType, exeToDstHopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false), + RemappedIndex(dstList[k].second, false), + &exeToDstLinkType, + &exeToDstHopCount)); + isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI); + if (isXgmiDst) numHopsDst = exeToDstHopCount; +#endif + } + else + { + isXgmiDst = true; + numHopsDst = 0; + } + } + + // Skip this DST if it is not XGMI but only XGMI links may be used + if (useXgmiOnly && !isXgmiDst) continue; + + // Skip this DST if total XGMI distance (SRC + DST) is less than min limit + if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue; + + // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit + if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue; + +#if defined(__NVCC__) + // Skip CPU executors on GPU memory on NVIDIA platform + if (IsCpuType(exeList[i].first) && (IsGpuType(dstList[j].first) || IsGpuType(dstList[k].first))) + continue; +#endif + + tinfo.dstType = dstList[k].first; + tinfo.dstIndex = dstList[k].second; + + possibleTransfers.push_back(tinfo); + } + } + 
} + + int const numPossible = (int)possibleTransfers.size(); + int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax); + + if (ev.sweepMin > numPossible) + { + printf("No valid test configurations exist\n"); + return; + } + + if (ev.outputToCsv) + { + printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms)," + "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n"); + } + + int numTestsRun = 0; + int M = ev.sweepMin; + std::uniform_int_distribution randSize(1, numBytesPerTransfer / sizeof(float)); + std::uniform_int_distribution distribution(ev.sweepMin, maxParallelTransfers); + + // Log sweep to configuration file + FILE *fp = fopen("lastSweep.cfg", "w"); + if (!fp) + { + printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n"); + exit(1); + } + + // Create bitmask of numPossible triplets, of which M will be chosen + std::string bitmask(M, 1); bitmask.resize(numPossible, 0); + auto cpuStart = std::chrono::high_resolution_clock::now(); + while (1) + { + if (isRandom) + { + // Pick random number of simultaneous transfers to execute + // NOTE: This currently skews distribution due to some #s having more possibilities than others + M = distribution(*ev.generator); + + // Generate a random bitmask + for (int i = 0; i < numPossible; i++) + bitmask[i] = (i < M) ? 
1 : 0; + std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator); + } + + // Convert bitmask to list of Transfers + std::vector transfers; + for (int value = 0; value < numPossible; ++value) + { + if (bitmask[value]) + { + // Convert integer value to (SRC->EXE->DST) triplet + Transfer transfer; + transfer.numSrcs = 1; + transfer.numDsts = 1; + transfer.srcType = {possibleTransfers[value].srcType}; + transfer.srcIndex = {possibleTransfers[value].srcIndex}; + transfer.exeType = possibleTransfers[value].exeType; + transfer.exeIndex = possibleTransfers[value].exeIndex; + transfer.dstType = {possibleTransfers[value].dstType}; + transfer.dstIndex = {possibleTransfers[value].dstIndex}; + transfer.numSubExecs = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs; + transfer.transferIndex = transfers.size(); + transfer.numBytes = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0; + transfers.push_back(transfer); + } + } + + LogTransfers(fp, ++numTestsRun, transfers); + ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers); + + // Check for test limit + if (numTestsRun == ev.sweepTestLimit) + { + printf("Test limit reached\n"); + break; + } + + // Check for time limit + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double totalCpuTime = std::chrono::duration_cast>(cpuDelta).count(); + if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit) + { + printf("Time limit exceeded\n"); + break; + } + + // Increment bitmask if not random sweep + if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end())) + { + M++; + // Check for completion + if (M > maxParallelTransfers) + { + printf("Sweep complete\n"); + break; + } + for (int i = 0; i < numPossible; i++) + bitmask[i] = (i < M) ? 
1 : 0; + } + } + fclose(fp); +} + +void LogTransfers(FILE *fp, int const testNum, std::vector const& transfers) +{ + fprintf(fp, "# Test %d\n", testNum); + fprintf(fp, "%d", -1 * (int)transfers.size()); + for (auto const& transfer : transfers) + { + fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)", + MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0], + ExeTypeStr[transfer.exeType], transfer.exeIndex, + MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0], + transfer.numSubExecs, + transfer.numBytes); + } + fprintf(fp, "\n"); + fflush(fp); +} + +std::string PtrVectorToStr(std::vector const& strVector, int const initOffset) +{ + std::stringstream ss; + for (int i = 0; i < strVector.size(); ++i) + { + if (i) ss << " "; + ss << (strVector[i] + initOffset); + } + return ss.str(); +} diff --git a/transferbench.so/tests.cmake b/transferbench.so/tests.cmake new file mode 100644 index 00000000..c1b6b74e --- /dev/null +++ b/transferbench.so/tests.cmake @@ -0,0 +1,27 @@ +################################################################################ +## +## Copyright (c) 2018 ROCm Developer Tools +## +## MIT LICENSE: +## Permission is hereby granted, free of charge, to any person obtaining a copy of +## this software and associated documentation files (the "Software"), to deal in +## the Software without restriction, including without limitation the rights to +## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +## of the Software, and to permit persons to whom the Software is furnished to do +## so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. +## +################################################################################ + + +include(tests_conf_logging) diff --git a/transferbench.so/transferbench/CHANGELOG.md b/transferbench.so/transferbench/CHANGELOG.md new file mode 100644 index 00000000..76932de4 --- /dev/null +++ b/transferbench.so/transferbench/CHANGELOG.md @@ -0,0 +1,199 @@ +# Changelog for TransferBench + +## v1.22 +### Modified +- Switching kernel timing function to wall_clock64 + +## v1.21 +### Fixed +- Fixed bug with SAMPLING_FACTOR + +## v1.20 +### Fixed +- VALIDATE_DIRECT can now be used with USE_PREP_KERNEL +- Switch to local GPU for validating GPU memory + +## v1.19 +### Added +- VALIDATE_DIRECT now also applies to source memory array checking +- Adding null memory pointer check prior to deallocation + +## v1.18 +### Added +- Adding ability to validate GPU destination memory directly without going through CPU staging buffer (VALIDATE_DIRECT) + - NOTE: This will only work on AMD devices with large-bar access enable and may slow things down considerably +### Changed +- Refactored how environment variables are displayed +- Mismatch stops after first detected error within an array instead of list all mismatched elements + +## v1.17 +### Added +- Allow switch to GFX kernel for source array initialization (USE_PREP_KERNEL) + - USE_PREP_KERNEL cannot be used with FILL_PATTERN +- Adding ability to compile with nvcc only (TransferBenchCuda) +### Changed +- Default pattern set to [Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)] +### Fixed +- Re-adding example.cfg file + +## v1.16 +### Added +- Additional src array validation during preparation +- Adding new env var CONTINUE_ON_ERROR to resume tests after 
mis-match detection +- Initializing GPU memory to 0 during allocation + +## v1.15 +### Fixed +- Fixed a bug that prevented single Transfers > 8GB +### Changed +- Removed "check for latest ROCm" warning when allocating too much memory +- Printing off source memory value as well when mis-match is detected + +## v1.14 +### Added +- Added documentation +- Added pthread linking in src/Makefile and CMakeLists.txt +- Added printing off the hex value of the floats for output and reference + +## v1.13 +### Added +- Added support for cmake + +### Changed +- Converted to the Pitchfork layout standard + +## v1.12 +### Added +- Added support for TransferBench on NVIDIA platforms (via HIP_PLATFORM=nvidia) + - CPU executors on NVIDIA platform cannot access GPU memory (no large-bar access) + +## v1.11 +### Added +- New multi-input / multi-output support (MIMO). Transfers now can reduce (element-wise summation) multiple input memory arrays + and write the sums to multiple outputs +- New GPU-DMA executor 'D' (uses hipMemcpy for SDMA copies). Previously this was done using USE_HIP_CALL, but now this allows + GPU-GFX kernel to run in parallel with GPU-DMA instead of applying to all GPU executors globally. + - GPU-DMA executor can only be used for single-input/single-output Transfers + - GPU-DMA executor can only be associated with one SubExecutor +- Added new "Null" memory type 'N', which represents empty memory. This allows for read-only or write-only Transfers +- Added new GPU_KERNEL environment variable that allows for switching between various GPU-GFX reduction kernels + +### Optimized +- Slightly improved GPU-GFX kernel performance based on hardware architecture when running with fewer CUs + +### Changed +- Updated the example.cfg file to cover the new features +- Updated output to support MIMO +- Changed CUs/CPUs threads naming to SubExecutors for consistency +- Sweep Preset: + - Default sweep preset executors now includes DMA +- P2P Benchmarks: + - Now only works via "p2p". 
Removed "p2p_rr", "g2g" and "g2g_rr". + - Setting NUM_CPU_DEVICES=0 can be used to only benchmark GPU devices (like "g2g") + - New environment variable USE_REMOTE_READ replaces "_rr" presets + - New environment variable USE_GPU_DMA=1 replaces USE_HIP_CALL=1 for benchmarking with GPU-DMA Executor + - Number of GPU SubExecutors for benchmark can be specified via NUM_GPU_SE + - Defaults to all CUs for GPU-GFX, 1 for GPU-DMA + - Number of CPU SubExecutors for benchmark can be specified via NUM_CPU_SE +- Psuedo-random input pattern has been slightly adjusted to have different patterns for each input array within same Transfer + +### Removed +- USE_HIP_CALL has been removed. Use GPU-DMA executor 'D' or set USE_GPU_DMA=1 for P2P benchmark presets + - Currently warning will be issued if USE_HIP_CALL is set to 1 and program will terminate +- Removed NUM_CPU_PER_TRANSFER - The number of CPU SubExecutors will be whatever is specified for the Transfer +- Removed USE_MEMSET environment variable. This can now be done via a Transfer using the null memory type + +## v1.10 +### Fixed +- Fix incorrect bandwidth calculation when using single stream mode and per-Transfer data sizes + +## v1.09 +### Added +- Printing off src/dst memory addresses during interactive mode +### Changed +- Switching to numa_set_preferred instead of set_mempolicy + +## v1.08 +### Changed +- Fixing handling of non-configured NUMA nodes +- Topology detection now shows actual NUMA node indices +- Fix for issue with NUM_GPU_DEVICES + +## v1.07 +### Changed +- Fix bug with allocations involving non-default CPU memory types + +## v1.06 +### Added +- Added unpinned CPU memory type ('U'). 
May require HSA_XNACK=1 in order to access via GPU executors +- Adding logging of sweep configuration to lastSweep.cfg +- Adding ability to specify number of CUs to use for sweep-based presets +### Changed +- Fixing random sweep repeatibility +- Fixing bug with CPU NUMA node memory allocation +- Modified advanced configuration file format to accept bytes per Transfer + +## v1.05 +### Added +- Topology output now includes NUMA node information +- Support for NUMA nodes with no CPU cores (e.g. CXL memory) +### Removed +- SWEEP_SRC_IS_EXE environment variable + +## v1.04 +### Added +- New environment variables for sweep based presets + - SWEEP_XGMI_MIN - Min number of XGMI hops for Transfers + - SWEEP_XGMI_MAX - Max number of XGMI hops for Transfers + - SWEEP_SEED - Random seed being used + - SWEEP_RAND_BYTES - Use random amount of bytes (up to pre-specified N) for each Transfer +### Changed + - CSV output for sweep includes env vars section followed by output + - CSV output no longer lists env var parameters in columns + - Default number of warmup iterations changed from 3 to 1 + - Splitting CSV output of link type to ExeToSrcLinkType and ExeToDstLinkType + +## v1.03 +### Added +- New preset modes stress-test benchmarks "sweep" and "randomsweep" + - sweep iterates over all possible sets of Transfers to test + - randomsweep iterates over random sets of Transfers + - New sweep-only environment variables can modify sweep + - SWEEP_SRC - String containing only "B","C","F", or "G", defining possible source memory types + - SWEEP_EXE - String containing only "C", or "G", defining possible executors + - SWEEP_DST - String containing only "B","C","F", or "G", defining possible destination memory types + - SWEEP_SRC_IS_EXE - Restrict executor to be the same as the source if non-zero + - SWEEP_MIN - Minimum number of parallel transfers to test + - SWEEP_MAX - Maximum number of parallel transfers to test + - SWEEP_COUNT - Maximum number of tests to run + - SWEEP_TIME_LIMIT - 
Maximum number of seconds to run tests for +- New environment variable to restrict number of available GPUs to test on (primarily for sweep runs) + - NUM_CPU_DEVICES - Number of CPU devices + - NUM_GPU_DEVICES - Number of GPU devices +### Changed +- Fixed timing display for CPU-executors when using single stream mode + +## v1.02 +### Added +- Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test +### Changed +- Copies are now refered to as Transfers instead of Links +- Re-ordering how env vars are displayed (alphabetically now) +### Removed +- Combined timing is now always on for kernel-based GPU copies. COMBINED_TIMING env var has been removed +- Use single sync is no longer supported to facility variable iterations. USE_SINGLE_SYNC env var has been removed + +## v1.01 +### Added +- Adding USE_SINGLE_STREAM feature + - All Links that execute on the same GPU device are executed with a single kernel launch on a single stream + - Does not work with USE_HIP_CALL and forces USE_SINGLE_SYNC to collect timings + - Adding ability to request coherent / fine-grained host memory ('B') +### Changed +- Separating TransferBench from RCCL repo +- Peer-to-peer benchmark mode now works OUTPUT_TO_CSV +- Toplogy display now works with OUTPUT_TO_CSV +- Moving documentation about config file into example.cfg +### Removed +- Removed config file generation +- Removed show pointer address environment variable (SHOW_ADDR) diff --git a/transferbench.so/transferbench/CMakeLists.txt b/transferbench.so/transferbench/CMakeLists.txt new file mode 100644 index 00000000..18df727b --- /dev/null +++ b/transferbench.so/transferbench/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+if (DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory") +else() + set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory") +endif() +cmake_minimum_required(VERSION 3.5) + +project(TransferBench VERSION 1.0 LANGUAGES CXX) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib") +include_directories(${ROCM_PATH}/include) +link_libraries(numa hsa-runtime64 pthread) +set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ..) +add_executable(TransferBench src/TransferBench.cpp) +target_include_directories(TransferBench PRIVATE src/include) + +find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH}) +include(ROCMInstallTargets) +include(ROCMCreatePackage) +set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF) + +set(PACKAGE_NAME TB) +set(LIBRARY_NAME TransferBench) + +rocm_install(TARGETS TransferBench) + +rocm_create_package( + NAME ${LIBRARY_NAME} + DESCRIPTION "TransferBench package" + MAINTAINER "RCCL Team " +) diff --git a/transferbench.so/transferbench/LICENSE.md b/transferbench.so/transferbench/LICENSE.md new file mode 100644 index 00000000..cf8533aa --- /dev/null +++ b/transferbench.so/transferbench/LICENSE.md @@ -0,0 +1,19 @@ +Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/transferbench.so/transferbench/README.md b/transferbench.so/transferbench/README.md new file mode 100644 index 00000000..b1155f78 --- /dev/null +++ b/transferbench.so/transferbench/README.md @@ -0,0 +1,63 @@ +# TransferBench + +TransferBench is a simple utility capable of benchmarking simultaneous copies between user-specified devices (CPUs/GPUs). + +## Requirements + +1. ROCm stack installed on the system (HIP runtime) +2. libnuma installed on system + +## Documentation + +Run the steps below to build documentation locally. + +``` +cd docs + +pip3 install -r .sphinx/requirements.txt + +python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html +``` + +## Building + To build TransferBench using Makefile: + ```shell + $ make + ``` + + To build TransferBench using cmake: + ```shell +$ mkdir build +$ cd build +$ CXX=/opt/rocm/bin/hipcc cmake .. +$ make + ``` + + If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately + +## NVIDIA platform support + +TransferBench may also be built to run on NVIDIA platforms either via HIP, or native nvcc + +To build with HIP for NVIDIA (requires HIP-compatible CUDA version installed e.g. 
CUDA 11.5): +``` + CUDA_PATH= HIP_PLATFORM=nvidia make` +``` + +To build with native nvcc: (Builds TransferBenchCuda) +``` + make +``` + +## Hints and suggestions +- Running TransferBench with no arguments will display usage instructions and detected topology information +- There are several preset configurations that can be used instead of a configuration file + including: + - p2p - Peer to peer benchmark test + - sweep - Sweep across possible sets of Transfers + - rsweep - Random sweep across possible sets of Transfers +- When using the same GPU executor in multiple simultaneous Transfers, performance may be + serialized due to the maximum number of hardware queues available. + - The number of maximum hardware queues can be adjusted via GPU_MAX_HW_QUEUES + - Alternatively, running in single stream mode (USE_SINGLE_STREAM=1) may avoid this issue + by launching all Transfers on a single stream instead of individual streams diff --git a/transferbench.so/transferbench/docs/.doxygen/Doxyfile b/transferbench.so/transferbench/docs/.doxygen/Doxyfile new file mode 100644 index 00000000..fbebc2a9 --- /dev/null +++ b/transferbench.so/transferbench/docs/.doxygen/Doxyfile @@ -0,0 +1,2455 @@ +# Doxyfile 1.8.10 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "hipblas" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = v4.0.5.0 + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. 
If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docBin + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. 
+ +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. 
+ +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". 
You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. 
In the latter case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = YES + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. 
+ +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = YES + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. 
The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. 
If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. 
If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. 
The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. 
+ +INPUT = ../../src/ \ + ../../src/include + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.as \ + *.js + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. 
This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. 
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = ../README.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. 
+ +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. 
+# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. 
The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. 
For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. 
+# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 1
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/