diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a750b23..0ddd669f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -474,7 +474,7 @@ add_subdirectory(rcqt.so)
 add_subdirectory(smqt.so)
 add_subdirectory(mem.so)
 add_subdirectory(babel.so)
-#add_subdirectory(edp.so)
+add_subdirectory(tfb.so)
 add_subdirectory(perf.so)
 
 if (RVS_BUILD_TESTS)
diff --git a/rvs/.rvsmodules.config b/rvs/.rvsmodules.config
index 047d0cac..af55776c 100644
--- a/rvs/.rvsmodules.config
+++ b/rvs/.rvsmodules.config
@@ -12,3 +12,4 @@ iet: libiet.so
 mem: libmem.so
 babel: libbabel.so
 perf: libperf.so
+tfb: libtfb.so
diff --git a/rvs/conf/tfb_single.conf b/rvs/conf/tfb_single.conf
new file mode 100644
index 00000000..4d7324c6
--- /dev/null
+++ b/rvs/conf/tfb_single.conf
@@ -0,0 +1,37 @@
+# ################################################################################
+# #
+# # Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+# #
+# # MIT LICENSE:
+# # Permission is hereby granted, free of charge, to any person obtaining a copy of
+# # this software and associated documentation files (the "Software"), to deal in
+# # the Software without restriction, including without limitation the rights to
+# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+# # of the Software, and to permit persons to whom the Software is furnished to do
+# # so, subject to the following conditions:
+# #
+# # The above copyright notice and this permission notice shall be included in all
+# # copies or substantial portions of the Software.
+# #
+# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# # SOFTWARE.
+# #
+# ###############################################################################
+
+
+# TransferBench test
+#
+# Run:
+#   sudo ./rvs -c conf/tfb_single.conf -d 3
+#
+# Expected result:
+#   TransferBench p2p transfers are executed
+actions:
+- name: p2p-benchmarks
+  module: tfb
+  transfer-type: p2p
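For context on how the new `tfb` entry is resolved at runtime: the launcher looks the action's module name up in `.rvsmodules.config` and dlopen()s the matching shared object. A minimal sketch (illustrative only — the actual loader is `rvs::module::find_create_module` in rvs/src/rvsmodule.cpp, and `../lib/rvs/` is just one of the prefixes it probes):

```cpp
// Minimal sketch of module resolution via .rvsmodules.config (illustrative).
#include <dlfcn.h>
#include <iostream>
#include <map>
#include <string>

int main() {
  // Parsed from .rvsmodules.config: module name -> shared object
  std::map<std::string, std::string> modules = {{"tfb", "libtfb.so"}};
  std::string sofullname = "../lib/rvs/" + modules["tfb"];  // assumed library prefix
  void* psolib = dlopen(sofullname.c_str(), RTLD_NOW);
  if (!psolib) {
    std::cerr << "dlopen failed: " << dlerror() << "\n";
    return 1;
  }
  dlclose(psolib);
  return 0;
}
```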
diff --git a/tfb.so/CMakeLists.txt b/tfb.so/CMakeLists.txt
new file mode 100644
index 00000000..ee2002d9
--- /dev/null
+++ b/tfb.so/CMakeLists.txt
@@ -0,0 +1,183 @@
+################################################################################
+##
+## Copyright (c) 2018 ROCm Developer Tools
+##
+## MIT LICENSE:
+## Permission is hereby granted, free of charge, to any person obtaining a copy of
+## this software and associated documentation files (the "Software"), to deal in
+## the Software without restriction, including without limitation the rights to
+## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+## of the Software, and to permit persons to whom the Software is furnished to do
+## so, subject to the following conditions:
+##
+## The above copyright notice and this permission notice shall be included in all
+## copies or substantial portions of the Software.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+## SOFTWARE.
+##
+################################################################################
+
+cmake_minimum_required ( VERSION 3.5.0 )
+if ( ${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
+  message(FATAL_ERROR "In-source build is not allowed")
+endif ()
+set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+
+set ( RVS "tfb" )
+set ( RVS_PACKAGE "rvs-roct" )
+set ( RVS_COMPONENT "lib${RVS}" )
+set ( RVS_TARGET "${RVS}" )
+
+project ( ${RVS_TARGET} )
+
+message(STATUS "MODULE: ${RVS}")
+add_compile_options(-std=c++11)
+add_compile_options(-Wall )
+if (RVS_COVERAGE)
+  add_compile_options(-O0 -fprofile-arcs -ftest-coverage)
+  set(CMAKE_EXE_LINKER_FLAGS "--coverage")
+  set(CMAKE_SHARED_LINKER_FLAGS "--coverage")
+endif()
+
+## Set default module path if not already set
+if ( NOT DEFINED CMAKE_MODULE_PATH )
+  set ( CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake_modules/" )
+endif ()
+
+## Include common cmake modules
+include ( utils )
+
+## Setup the package version.
+get_version ( "0.0.0" ) + +set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) +set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) +set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) + set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) +endif () +set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +## make version numbers visible to C code +add_compile_options(-DBUILD_VERSION_MAJOR=${VERSION_MAJOR}) +add_compile_options(-DBUILD_VERSION_MINOR=${VERSION_MINOR}) +add_compile_options(-DBUILD_VERSION_PATCH=${VERSION_PATCH}) +add_compile_options(-DLIB_VERSION_STRING="${LIB_VERSION_STRING}") +add_compile_options(-DBUILD_VERSION_STRING="${BUILD_VERSION_STRING}") + +set(ROCBLAS_LIB "rocblas") +set(HIP_HCC_LIB "amdhip64") + +# Determine HSA_PATH +if(NOT DEFINED HIPCC_PATH) + if(NOT DEFINED ENV{HIPCC_PATH}) + set(HIPCC_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which hipcc runtime has been installed") + else() + set(HIPCC_PATH $ENV{HIPCC_PATH} CACHE PATH "Path to which hipcc runtime has been installed") + endif() +endif() + +# Determine HSA_PATH +if(NOT DEFINED HSA_PATH) + if(NOT DEFINED ENV{HSA_PATH}) + set(HSA_PATH "${ROCM_PATH}/hsa" CACHE PATH "Path to which HSA runtime has been installed") + else() + set(HSA_PATH $ENV{HSA_PATH} CACHE PATH "Path to which HSA runtime has been installed") + endif() +endif() + +# Add HIP_VERSION to CMAKE__FLAGS +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}") + +set(HIP_HCC_BUILD_FLAGS) +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include ${ASAN_CXX_FLAGS}") + +# Set compiler and compiler flags +set(CMAKE_CXX_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_C_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS} -L${ROCM_PATH}/lib") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LD_FLAGS}") +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LD_FLAGS}") + +if(BUILD_ADDRESS_SANITIZER) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.asan-x86_64.so + OUTPUT_VARIABLE ASAN_LIB_FULL_PATH) + get_filename_component(ASAN_LIB_PATH ${ASAN_LIB_FULL_PATH} DIRECTORY) +endif() + +# Determine Roc Runtime header files are accessible +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h) + message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h) + message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +# Determine Roc Runtime header files are accessible +if(DEFINED RVS_ROCMSMI) + if(NOT RVS_ROCMSMI EQUAL 1) + if(NOT EXISTS ${ROCBLAS_INC_DIR}/rocblas.h) + message("ERROR: rocBLAS headers can't be found under specified path. Please set ROCBLAS_INC_DIR path. Current value is : " ${ROCBLAS_INC_DIR}) + RETURN() + endif() + + if(NOT EXISTS "${ROCBLAS_LIB_DIR}/lib${ROCBLAS_LIB}.so") + message("ERROR: rocBLAS library can't be found under specified path. 
+      RETURN()
+    endif()
+  endif()
+endif()
+
+
+if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
+  message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+  RETURN()
+endif()
+
+## define include directories
+include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${ROCM_PATH}/include)
+# Add directories to look for library files to link
+link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
+## additional libraries
+set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so numa hsa-runtime64 )
+
+## define source files
+set(SOURCES src/rvs_module.cpp src/action.cpp src/tfb_worker.cpp)
+
+## define target
+add_library( ${RVS_TARGET} SHARED ${SOURCES})
+set_target_properties(${RVS_TARGET} PROPERTIES
+  SUFFIX .so.${LIB_VERSION_STRING}
+  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
+#add_dependencies(${RVS_TARGET} rvslibrt rvslib)
+add_dependencies(${RVS_TARGET} rvslib)
+
+add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+COMMAND ln -fs ./lib${RVS}.so.${LIB_VERSION_STRING} lib${RVS}.so.${VERSION_MAJOR} WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+COMMAND ln -fs ./lib${RVS}.so.${VERSION_MAJOR} lib${RVS}.so WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+)
+
+install(TARGETS ${RVS_TARGET} LIBRARY DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule)
+install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR}" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule)
+install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule)
+
+# TEST SECTION
+#if (RVS_BUILD_TESTS)
+#  add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+#    COMMAND ln -fs ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR} ${RVS_BINTEST_FOLDER}/lib${RVS}.so WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+#  )
+#  include(${CMAKE_CURRENT_SOURCE_DIR}/tests.cmake)
+#endif()
diff --git a/tfb.so/include/Compatibility.hpp b/tfb.so/include/Compatibility.hpp
new file mode 100644
index 00000000..5e76cf50
--- /dev/null
+++ b/tfb.so/include/Compatibility.hpp
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__NVCC__)
+
+#include <cuda_runtime.h>
+
+// ROCm specific
+#define __builtin_amdgcn_s_memrealtime clock64
+#define gcnArchName name
+
+// Datatypes
+#define hipDeviceProp_t cudaDeviceProp
+#define hipError_t cudaError_t
+#define hipEvent_t cudaEvent_t
+#define hipStream_t cudaStream_t
+
+// Enumerations
+#define hipDeviceAttributeClockRate cudaDevAttrClockRate
+#define hipDeviceAttributeMaxSharedMemoryPerMultiprocessor cudaDevAttrMaxSharedMemoryPerMultiprocessor
+#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
+#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
+#define hipFuncCachePreferShared cudaFuncCachePreferShared
+#define hipMemcpyDefault cudaMemcpyDefault
+#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
+#define hipSuccess cudaSuccess
+
+// Functions
+#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer
+#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess
+#define hipDeviceGetAttribute cudaDeviceGetAttribute
+#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
+#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig
+#define hipDeviceSynchronize cudaDeviceSynchronize
+#define hipEventCreate cudaEventCreate
+#define hipEventDestroy cudaEventDestroy
+#define hipEventElapsedTime cudaEventElapsedTime
+#define hipEventRecord cudaEventRecord
+#define hipFree cudaFree
+#define hipGetDeviceCount cudaGetDeviceCount
+#define hipGetDeviceProperties cudaGetDeviceProperties
+#define hipGetErrorString cudaGetErrorString
+#define hipHostFree cudaFreeHost
+#define hipHostMalloc cudaMallocHost
+#define hipMalloc cudaMalloc
+#define hipMemcpy cudaMemcpy
+#define hipMemcpyAsync cudaMemcpyAsync
+#define hipMemset cudaMemset
+#define hipMemsetAsync cudaMemsetAsync
+#define hipSetDevice cudaSetDevice
+#define hipStreamCreate cudaStreamCreate
+#define hipStreamDestroy cudaStreamDestroy
+#define hipStreamSynchronize cudaStreamSynchronize
+
+// Define float4 addition operator for NVIDIA platform
+__device__ inline float4& operator +=(float4& a, const float4& b)
+{
+  a.x += b.x;
+  a.y += b.y;
+  a.z += b.z;
+  a.w += b.w;
+  return a;
+}
+
+#else
+
+#include <hip/hip_ext.h>
+#include <hip/hip_runtime.h>
+#include <hsa/hsa_ext_amd.h>
+
+#endif
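Compatibility.hpp lets the rest of tfb.so be written purely against HIP names on both vendors. A minimal compile sketch (illustrative; under __NVCC__ the macros above rewrite these calls to their cuda* equivalents):

```cpp
// Illustrative only: checks that the Compatibility.hpp aliases resolve on either platform.
#include "Compatibility.hpp"
#include <cstdio>

int main() {
  float host[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float* dev = nullptr;
  if (hipMalloc((void**)&dev, sizeof(host)) != hipSuccess) return 1;
  if (hipMemcpy(dev, host, sizeof(host), hipMemcpyHostToDevice) != hipSuccess) return 1;
  hipFree(dev);
  printf("copied %zu bytes\n", sizeof(host));
  return 0;
}
```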
diff --git a/tfb.so/include/EnvVars.hpp b/tfb.so/include/EnvVars.hpp
new file mode 100644
index 00000000..bac0746e
--- /dev/null
+++ b/tfb.so/include/EnvVars.hpp
@@ -0,0 +1,510 @@
+/*
+Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ENVVARS_HPP
+#define ENVVARS_HPP
+
+#include <algorithm>
+#include <numa.h>
+#include <random>
+#include "Compatibility.hpp"
+#include "Kernels.hpp"
+
+#define TB_VERSION "1.22"
+
+extern char const MemTypeStr[];
+extern char const ExeTypeStr[];
+
+enum ConfigModeEnum
+{
+  CFG_FILE  = 0,
+  CFG_P2P   = 1,
+  CFG_SWEEP = 2
+};
+
+// This class manages environment variables that affect TransferBench
+class EnvVars
+{
+public:
+  // Default configuration values
+  int const DEFAULT_NUM_WARMUPS     = 1;
+  int const DEFAULT_NUM_ITERATIONS  = 10;
+  int const DEFAULT_SAMPLING_FACTOR = 1;
+
+  // Peer-to-peer Benchmark preset defaults
+  int const DEFAULT_P2P_NUM_CPU_SE = 4;
+
+  // Sweep-preset defaults
+  std::string const DEFAULT_SWEEP_SRC = "CG";
+  std::string const DEFAULT_SWEEP_EXE = "CDG";
+  std::string const DEFAULT_SWEEP_DST = "CG";
+  int const DEFAULT_SWEEP_MIN        = 1;
+  int const DEFAULT_SWEEP_MAX        = 24;
+  int const DEFAULT_SWEEP_TEST_LIMIT = 0;
+  int const DEFAULT_SWEEP_TIME_LIMIT = 0;
+
+  // Environment variables
+  int blockBytes;       // Each CU, except the last, gets a multiple of this many bytes to copy
+  int byteOffset;       // Byte-offset for memory allocations
+  int continueOnError;  // Continue tests even after mismatch detected
+  int numCpuDevices;    // Number of CPU devices to use (defaults to # NUMA nodes detected)
+  int numGpuDevices;    // Number of GPU devices to use (defaults to # HIP devices detected)
+  int numIterations;    // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
+  int numWarmups;       // Number of un-timed warmup iterations to perform
+  int outputToCsv;      // Output in CSV format
+  int samplingFactor;   // Affects how many different values of N are generated (when N set to 0)
+  int sharedMemBytes;   // Amount of shared memory to use per threadblock
+  int useInteractive;   // Pause for user-input before starting transfer loop
+  int usePcieIndexing;  // Base GPU indexing on PCIe address instead of HIP device
+  int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
+  int useSingleStream;  // Use a single stream per GPU GFX executor instead of stream per Transfer
+  int validateDirect;   // Validate GPU destination memory directly instead of staging GPU memory on host
+
+  std::vector<float> fillPattern; // Pattern of floats used to fill source data
+
+  // Environment variables only for Benchmark-preset
+  int useRemoteRead;    // Use destination memory type as executor instead of source memory type
+  int useDmaCopy;       // Use DMA copy instead of GPU copy
+  int numGpuSubExecs;   // Number of GPU subexecutors to use
+  int numCpuSubExecs;   // Number of CPU subexecutors to use
+
+  // Environment variables only for Sweep-preset
+  int sweepMin;         // Min number of simultaneous Transfers to be executed per test
+  int sweepMax;         // Max number of simultaneous Transfers to be executed per test
+  int sweepTestLimit;   // Max number of tests to run during sweep (0 = no limit)
+  int sweepTimeLimit;   // Max number of seconds to run sweep for (0 = no limit)
+  int sweepXgmiMin;     // Min number of XGMI hops for Transfers
+  int sweepXgmiMax;     // Max number of XGMI hops for Transfers (-1 = no limit)
+  int sweepSeed;        // Random seed to use
+  int sweepRandBytes;   // Whether or not to use random number of bytes per Transfer
+  std::string sweepSrc; // Set of src memory types to be swept
+  std::string sweepExe; // Set of executors to be swept
+  std::string sweepDst; // Set of dst memory types to be swept
+
+  // Developer features
+  int enableDebug;      // Enable debug output
+  int gpuKernel;        // Which GPU kernel to use
+
+  // Used to track current configuration mode
+  ConfigModeEnum configMode;
+
+  // Random generator
+  std::default_random_engine *generator;
+
+  // Track how many CPUs are available per NUMA node
+  std::vector<int> numCpusPerNuma;
+
+  // Constructor that collects values
+  EnvVars()
+  {
+    int maxSharedMemBytes = 0;
+    HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes,
+                                   hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0));
+#if !defined(__NVCC__)
+    int defaultSharedMemBytes = maxSharedMemBytes / 2 + 1;
+#else
+    int defaultSharedMemBytes = 0;
+#endif
+
+    int numDeviceCUs = 0;
+    HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0));
+
+    int numDetectedCpus = numa_num_configured_nodes();
+    int numDetectedGpus;
+    HIP_CALL(hipGetDeviceCount(&numDetectedGpus));
+
+    hipDeviceProp_t prop;
+    HIP_CALL(hipGetDeviceProperties(&prop, 0));
+    std::string fullName = prop.gcnArchName;
+    std::string archName = fullName.substr(0, fullName.find(':'));
+
+    // Different hardware picks different GPU kernels
+    // This performance difference is generally only noticeable when executing on fewer CUs
+    int defaultGpuKernel = 0;
+    if      (archName == "gfx906") defaultGpuKernel = 13;
+    else if (archName == "gfx90a") defaultGpuKernel = 9;
+
+    blockBytes       = GetEnvVar("BLOCK_BYTES"       , 256);
+    byteOffset       = GetEnvVar("BYTE_OFFSET"       , 0);
+    continueOnError  = GetEnvVar("CONTINUE_ON_ERROR" , 0);
+    numCpuDevices    = GetEnvVar("NUM_CPU_DEVICES"   , numDetectedCpus);
+    numGpuDevices    = GetEnvVar("NUM_GPU_DEVICES"   , numDetectedGpus);
+    numIterations    = GetEnvVar("NUM_ITERATIONS"    , DEFAULT_NUM_ITERATIONS);
+    numWarmups       = GetEnvVar("NUM_WARMUPS"       , DEFAULT_NUM_WARMUPS);
+    outputToCsv      = GetEnvVar("OUTPUT_TO_CSV"     , 0);
+    samplingFactor   = GetEnvVar("SAMPLING_FACTOR"   , DEFAULT_SAMPLING_FACTOR);
+    sharedMemBytes   = GetEnvVar("SHARED_MEM_BYTES"  , defaultSharedMemBytes);
+    useInteractive   = GetEnvVar("USE_INTERACTIVE"   , 0);
+    usePcieIndexing  = GetEnvVar("USE_PCIE_INDEX"    , 0);
+    usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL"   , 0);
+    useSingleStream  = GetEnvVar("USE_SINGLE_STREAM" , 0);
+    validateDirect   = GetEnvVar("VALIDATE_DIRECT"   , 0);
+    enableDebug      = GetEnvVar("DEBUG"             , 0);
+    gpuKernel        = GetEnvVar("GPU_KERNEL"        , defaultGpuKernel);
+
+    // P2P Benchmark related
+    useRemoteRead    = GetEnvVar("USE_REMOTE_READ"   , 0);
+    useDmaCopy       = GetEnvVar("USE_GPU_DMA"       , 0);
+    numGpuSubExecs   = GetEnvVar("NUM_GPU_SE"        , useDmaCopy ? 1 : numDeviceCUs);
+    numCpuSubExecs   = GetEnvVar("NUM_CPU_SE"        , DEFAULT_P2P_NUM_CPU_SE);
+
+    // Sweep related
+    sweepMin         = GetEnvVar("SWEEP_MIN"         , DEFAULT_SWEEP_MIN);
+    sweepMax         = GetEnvVar("SWEEP_MAX"         , DEFAULT_SWEEP_MAX);
+    sweepSrc         = GetEnvVar("SWEEP_SRC"         , DEFAULT_SWEEP_SRC);
+    sweepExe         = GetEnvVar("SWEEP_EXE"         , DEFAULT_SWEEP_EXE);
+    sweepDst         = GetEnvVar("SWEEP_DST"         , DEFAULT_SWEEP_DST);
+    sweepTestLimit   = GetEnvVar("SWEEP_TEST_LIMIT"  , DEFAULT_SWEEP_TEST_LIMIT);
+    sweepTimeLimit   = GetEnvVar("SWEEP_TIME_LIMIT"  , DEFAULT_SWEEP_TIME_LIMIT);
+    sweepXgmiMin     = GetEnvVar("SWEEP_XGMI_MIN"    , 0);
+    sweepXgmiMax     = GetEnvVar("SWEEP_XGMI_MAX"    , -1);
+    sweepRandBytes   = GetEnvVar("SWEEP_RAND_BYTES"  , 0);
+
+    // Determine random seed
+    char *sweepSeedStr = getenv("SWEEP_SEED");
+    sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
+    generator = new std::default_random_engine(sweepSeed);
+
+    // Check for fill pattern
+    char* pattern = getenv("FILL_PATTERN");
+    if (pattern != NULL)
+    {
+      if (usePrepSrcKernel)
+      {
+        printf("[ERROR] Unable to use FILL_PATTERN and USE_PREP_KERNEL together\n");
+        exit(1);
+      }
+
+      int patternLen = strlen(pattern);
+      if (patternLen % 2)
+      {
+        printf("[ERROR] FILL_PATTERN must contain an even number of hex digits\n");
+        exit(1);
+      }
+
+      // Read in bytes
+      std::vector<unsigned char> bytes;
+      unsigned char val = 0;
+      for (int i = 0; i < patternLen; i++)
+      {
+        if ('0' <= pattern[i] && pattern[i] <= '9')
+          val += (pattern[i] - '0');
+        else if ('A' <= pattern[i] && pattern[i] <= 'F')
+          val += (pattern[i] - 'A' + 10);
+        else if ('a' <= pattern[i] && pattern[i] <= 'f')
+          val += (pattern[i] - 'a' + 10);
+        else
+        {
+          printf("[ERROR] FILL_PATTERN must contain an even number of hex digits (0-9/a-f/A-F). (not %c)\n", pattern[i]);
+          exit(1);
+        }
+
+        if (i % 2 == 0)
+          val <<= 4;
+        else
+        {
+          bytes.push_back(val);
+          val = 0;
+        }
+      }
+
+      // Reverse bytes (input is assumed to be given in big-endian)
+      std::reverse(bytes.begin(), bytes.end());
+
+      // Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
+      int copies;
+      switch (patternLen % 8)
+      {
+      case 0:  copies = 1; break;
+      case 4:  copies = 2; break;
+      default: copies = 4; break;
+      }
+
+      // Fill floats
+      int numFloats = copies * patternLen / 8;
+      fillPattern.resize(numFloats);
+      unsigned char* rawData = (unsigned char*) fillPattern.data();
+      for (int i = 0; i < numFloats * 4; i++)
+        rawData[i] = bytes[i % bytes.size()];
+    }
+    else fillPattern.clear();
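+
+    // Worked example (illustrative, not in the original source): FILL_PATTERN=deadbeef
+    // yields bytes {0xde,0xad,0xbe,0xef}, reversed to little-endian {0xef,0xbe,0xad,0xde}.
+    // patternLen % 8 == 0 gives copies = 1 and numFloats = 1, so every float in the
+    // source buffer is bit-filled with the 32-bit value 0xdeadbeef.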
+
+    // Perform some basic validation
+    if (numCpuDevices > numDetectedCpus)
+    {
+      printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
+      exit(1);
+    }
+    if (numGpuDevices > numDetectedGpus)
+    {
+      printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
+      exit(1);
+    }
+    if (byteOffset % sizeof(float))
+    {
+      printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
+      exit(1);
+    }
+    if (numWarmups < 0)
+    {
+      printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
+      exit(1);
+    }
+    if (samplingFactor < 1)
+    {
+      printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
+      exit(1);
+    }
+    if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
+    {
+      printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes);
+      exit(1);
+    }
+    if (blockBytes <= 0 || blockBytes % 4)
+    {
+      printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
+      exit(1);
+    }
+
+    if (numGpuSubExecs <= 0)
+    {
+      printf("[ERROR] NUM_GPU_SE must be greater than 0\n");
+      exit(1);
+    }
+
+    if (numCpuSubExecs <= 0)
+    {
+      printf("[ERROR] NUM_CPU_SE must be greater than 0\n");
+      exit(1);
+    }
+
+    for (auto ch : sweepSrc)
+    {
+      if (!strchr(MemTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
+        exit(1);
+      }
+    }
+
+    for (auto ch : sweepDst)
+    {
+      if (!strchr(MemTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
+        exit(1);
+      }
+    }
+
+    for (auto ch : sweepExe)
+    {
+      if (!strchr(ExeTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
+        exit(1);
+      }
+    }
+    if (gpuKernel < 0 || gpuKernel >= NUM_GPU_KERNELS)
+    {
+      printf("[ERROR] GPU kernel must be between 0 and %d\n", NUM_GPU_KERNELS - 1);
+      exit(1);
+    }
+
+    // Determine how many CPUs exist per NUMA node (to avoid executing on NUMA without CPUs)
+    numCpusPerNuma.resize(numDetectedCpus);
+    int const totalCpus = numa_num_configured_cpus();
+    for (int i = 0; i < totalCpus; i++)
+      numCpusPerNuma[numa_node_of_cpu(i)]++;
+
+    // Check for deprecated env vars
+    if (getenv("USE_HIP_CALL"))
+    {
+      printf("[WARN] USE_HIP_CALL has been deprecated. Please use DMA executor 'D' or set USE_GPU_DMA for P2P-Benchmark preset\n");
+      exit(1);
+    }
+
+    char* enableSdma = getenv("HSA_ENABLE_SDMA");
+    if (enableSdma && !strcmp(enableSdma, "0"))
+    {
+      printf("[WARN] DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. Copies will fallback to blit kernels\n");
+    }
+  }
+
+  // Display info on the env vars that can be used
+  static void DisplayUsage()
+  {
+    printf("Environment variables:\n");
+    printf("======================\n");
+    printf(" BLOCK_BYTES=B      - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
+    printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
+    printf(" CONTINUE_ON_ERROR  - Continue tests even after mismatch detected\n");
+    printf(" FILL_PATTERN=STR   - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
+    printf(" NUM_CPU_DEVICES=X  - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
+    printf(" NUM_GPU_DEVICES=X  - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
+    printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
+    printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
+    printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
+    printf(" SAMPLING_FACTOR=F  - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
+    printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
+    printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
+    printf(" USE_PCIE_INDEX     - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
+    printf(" USE_PREP_KERNEL    - Use GPU kernel to initialize source data array pattern\n");
+    printf(" USE_SINGLE_STREAM  - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
+    printf(" VALIDATE_DIRECT    - Validate GPU destination memory directly instead of staging GPU memory on host\n");
+  }
+
+  // Helper macro to switch between CSV and terminal output
+#define PRINT_EV(NAME, VALUE, DESCRIPTION) \
+  printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
+
+#define PRINT_ES(NAME, VALUE, DESCRIPTION) \
+  printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
"," : " : ", (DESCRIPTION).c_str()) + + // Display env var settings + void DisplayEnvVars() const + { + if (!outputToCsv) + { + printf("TransferBench v%s\n", TB_VERSION); + printf("=====================================================\n"); + printf("[Common]\n"); + } + else + printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION); + + PRINT_EV("BLOCK_BYTES", blockBytes, + std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy")); + PRINT_EV("BYTE_OFFSET", byteOffset, + std::string("Using byte offset of " + std::to_string(byteOffset))); + PRINT_EV("CONTINUE_ON_ERROR", continueOnError, + std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error")); + PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, + (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString())); + PRINT_EV("GPU_KERNEL", gpuKernel, + std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]"); + PRINT_EV("NUM_CPU_DEVICES", numCpuDevices, + std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices"); + PRINT_EV("NUM_GPU_DEVICES", numGpuDevices, + std::string("Using ") + std::to_string(numGpuDevices) + " GPU devices"); + PRINT_EV("NUM_ITERATIONS", numIterations, + std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " " + + (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test")); + PRINT_EV("NUM_WARMUPS", numWarmups, + std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test")); + PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes, + std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock")); + PRINT_EV("USE_INTERACTIVE", useInteractive, + std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode"); + PRINT_EV("USE_PCIE_INDEX", usePcieIndexing, + std::string("Use ") + (usePcieIndexing ? "PCIe" : "HIP") + " GPU device indexing"); + PRINT_EV("USE_PREP_KERNEL", usePrepSrcKernel, + std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data"); + PRINT_EV("USE_SINGLE_STREAM", useSingleStream, + std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer")); + PRINT_EV("VALIDATE_DIRECT", validateDirect, + std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer")); + printf("\n"); + }; + + // Display env var for P2P Benchmark preset + void DisplayP2PBenchmarkEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[P2P Related]\n"); + + PRINT_EV("NUM_CPU_SE", numCpuSubExecs, + std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors"); + PRINT_EV("NUM_GPU_SE", numGpuSubExecs, + std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors"); + PRINT_EV("USE_GPU_DMA", useDmaCopy, + std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor"); + PRINT_EV("USE_REMOTE_READ", useRemoteRead, + std::string("Using ") + (useRemoteRead ? 
"DST" : "SRC") + " as executor"); + printf("\n"); + } + + // Display env var settings + void DisplaySweepEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[Sweep Related]\n"); + PRINT_ES("SWEEP_DST", sweepDst.c_str(), + std::string("Destination Memory Types to sweep")); + PRINT_ES("SWEEP_EXE", sweepExe.c_str(), + std::string("Executor Types to sweep")); + PRINT_EV("SWEEP_MAX", sweepMax, + std::string("Max simultaneous transfers (0 = no limit)")); + PRINT_EV("SWEEP_MIN", sweepMin, + std::string("Min simultaenous transfers")); + PRINT_EV("SWEEP_RAND_BYTES", sweepRandBytes, + std::string("Using ") + (sweepRandBytes ? "random" : "constant") + " number of bytes per Transfer"); + PRINT_EV("SWEEP_SEED", sweepSeed, + std::string("Random seed set to ") + std::to_string(sweepSeed)); + PRINT_ES("SWEEP_SRC", sweepSrc.c_str(), + std::string("Source Memory Types to sweep")); + PRINT_EV("SWEEP_TEST_LIMIT", sweepTestLimit, + std::string("Max number of tests to run during sweep (0 = no limit)")); + PRINT_EV("SWEEP_TIME_LIMIT", sweepTimeLimit, + std::string("Max number of seconds to run sweep for (0 = no limit)")); + PRINT_EV("SWEEP_XGMI_MAX", sweepXgmiMax, + std::string("Max number of XGMI hops for Transfers (-1 = no limit)")); + PRINT_EV("SWEEP_XGMI_MIN", sweepXgmiMin, + std::string("Min number of XGMI hops for Transfers")); + printf("\n"); + } + + // Helper function that gets parses environment variable or sets to default value + static int GetEnvVar(std::string const& varname, int defaultValue) + { + if (getenv(varname.c_str())) + return atoi(getenv(varname.c_str())); + return defaultValue; + } + + static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue) + { + if (getenv(varname.c_str())) + return getenv(varname.c_str()); + return defaultValue; + } +}; + +#endif diff --git a/tfb.so/include/GetClosestNumaNode.hpp b/tfb.so/include/GetClosestNumaNode.hpp new file mode 100644 index 00000000..266f1eb1 --- /dev/null +++ b/tfb.so/include/GetClosestNumaNode.hpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+// Helper macro for checking HSA calls
+#define HSA_CHECK(cmd)                                                       \
+  do {                                                                       \
+    hsa_status_t error = (cmd);                                              \
+    if (error != HSA_STATUS_SUCCESS) {                                       \
+      const char* errString = NULL;                                          \
+      hsa_status_string(error, &errString);                                  \
+      std::cerr << "Encountered HSA error (" << errString << ") at line "    \
+                << __LINE__ << " in file " << __FILE__ << "\n";              \
+      exit(-1);                                                              \
+    }                                                                        \
+  } while (0)
+
+// Structure to hold HSA agent information
+#if !defined(__NVCC__)
+struct AgentData
+{
+  bool isInitialized;
+  std::vector<hsa_agent_t> cpuAgents;
+  std::vector<hsa_agent_t> gpuAgents;
+  std::vector<int> closestNumaNode;
+};
+
+// Simple callback function to return any memory pool for an agent
+hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
+{
+  hsa_amd_memory_pool_t* poolData = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
+
+  // Check memory pool flags
+  uint32_t poolFlags;
+  HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &poolFlags));
+
+  // Only consider coarse-grained pools
+  if (!(poolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) return HSA_STATUS_SUCCESS;
+
+  *poolData = pool;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Callback function to gather HSA agent information
+hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data)
+{
+  AgentData* agentData = reinterpret_cast<AgentData*>(data);
+
+  // Get the device type
+  hsa_device_type_t deviceType;
+  HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType));
+  if (deviceType == HSA_DEVICE_TYPE_CPU)
+    agentData->cpuAgents.push_back(agent);
+  if (deviceType == HSA_DEVICE_TYPE_GPU)
+  {
+    agentData->gpuAgents.push_back(agent);
+    agentData->closestNumaNode.push_back(0);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+AgentData& GetAgentData()
+{
+  static AgentData agentData = {};
+
+  if (!agentData.isInitialized)
+  {
+    agentData.isInitialized = true;
+
+    // Add all detected agents to the list
+    HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
+
+    // Loop over each GPU
+    for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++)
+    {
+      // Collect memory pool
+      hsa_amd_memory_pool_t pool;
+      HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
+
+      // Loop over each CPU agent and check distance
+      int bestDistance = -1;
+      for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++)
+      {
+        // Determine number of hops from GPU memory pool to CPU agent
+        uint32_t hops = 0;
+        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                     pool,
+                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
+                                                     &hops));
+        // Gather link info
+        hsa_amd_memory_pool_link_info_t* link_info =
+          (hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t));
+        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                     pool,
+                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
+                                                     link_info));
+        int numaDist = 0;
+        for (uint32_t k = 0; k < hops; k++)
+        {
+          numaDist += link_info[k].numa_distance;
+        }
+        if (bestDistance == -1 || numaDist < bestDistance)
+        {
+          agentData.closestNumaNode[i] = j;
+          bestDistance = numaDist;
+        }
+        free(link_info);
+      }
+    }
+  }
+  return agentData;
+}
+#endif
+
+// Returns closest CPU NUMA node to provided GPU
+// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing
+int GetClosestNumaNode(int gpuIdx)
+{
+#if defined(__NVCC__)
+  return -1;
+#else
+  AgentData& agentData = GetAgentData();
+  if (gpuIdx < 0 || gpuIdx >= (int)agentData.closestNumaNode.size())
+  {
+    printf("[ERROR] GPU index is out of bounds\n");
+    exit(1);
+  }
+  return agentData.closestNumaNode[gpuIdx];
+#endif
+}
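GetClosestNumaNode() is what allows host-side buffers to be placed near the executing GPU. A minimal usage sketch (assumed, not part of this patch — BindNearGpu is a hypothetical helper; numa_set_preferred comes from libnuma):

```cpp
#include <numa.h>

// Hypothetical helper: prefer allocations on the NUMA node closest to gpuIdx.
void BindNearGpu(int gpuIdx) {
  int node = GetClosestNumaNode(gpuIdx);  // returns -1 on NVIDIA builds
  if (node >= 0)
    numa_set_preferred(node);             // later heap allocations prefer this node
}
```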
diff --git a/tfb.so/include/Kernels.hpp b/tfb.so/include/Kernels.hpp
new file mode 100644
index 00000000..dcb6bf07
--- /dev/null
+++ b/tfb.so/include/Kernels.hpp
@@ -0,0 +1,409 @@
+/*
+Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#define PackedFloat_t float4
+#define WARP_SIZE 64
+#define BLOCKSIZE 256
+#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
+#define MEMSET_CHAR 75
+#define MEMSET_VAL 13323083.0f
+
+// Each subExecutor is provided with subarrays to work on
+#define MAX_SRCS 16
+#define MAX_DSTS 16
+struct SubExecParam
+{
+  size_t    N;             // Number of floats this subExecutor works on
+  int       numSrcs;       // Number of source arrays
+  int       numDsts;       // Number of destination arrays
+  float*    src[MAX_SRCS]; // Source array pointers
+  float*    dst[MAX_DSTS]; // Destination array pointers
+  long long startCycle;    // Start timestamp for in-kernel timing (GPU-GFX executor)
+  long long stopCycle;     // Stop timestamp for in-kernel timing (GPU-GFX executor)
+};
+
+void CpuReduceKernel(SubExecParam const& p)
+{
+  int const& numSrcs = p.numSrcs;
+  int const& numDsts = p.numDsts;
+
+  if (numSrcs == 0)
+  {
+    for (int i = 0; i < numDsts; ++i)
+      memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
+  }
+  else if (numSrcs == 1)
+  {
+    float const* __restrict__ src = p.src[0];
+    for (int i = 0; i < numDsts; ++i)
+    {
+      memcpy(p.dst[i], src, p.N * sizeof(float));
+    }
+  }
+  else
+  {
+    for (size_t j = 0; j < p.N; j++)
+    {
+      float sum = p.src[0][j];
+      for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
+      for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
+    }
+  }
+}
+
+std::string PrepSrcValueString()
+{
+  return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
+}
+
+__host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
+{
+  return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
+}
+
+// GPU kernel to prepare src buffer data
+__global__ void
+PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
+{
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < N;
+       idx += blockDim.x * gridDim.x)
+  {
+    ptr[idx] = PrepSrcValue(srcBufferIdx, idx);
+  }
+}
+
+// Helper function for memset
+template <typename T> __device__ __forceinline__ T      MemsetVal();
+template <>           __device__ __forceinline__ float  MemsetVal(){ return MEMSET_VAL; };
+template <>           __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
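+
+// Each GPU subExecutor is one threadblock: GpuReduceKernel below reads its own
+// SubExecParam (params[blockIdx.x]), element-wise sums its numSrcs source arrays,
+// broadcasts the result to all numDsts destination arrays, and records
+// wall_clock64() start/stop cycles for in-kernel timing.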
+// GPU copy kernel 0: 3 loops: unrolled float4s, float4s, floats
+template <int LOOP1_UNROLL>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params)
+{
+  int64_t startCycle = wall_clock64();
+
+  // Operate on wavefront granularity
+  SubExecParam& p    = params[blockIdx.x];
+  int const numSrcs  = p.numSrcs;
+  int const numDsts  = p.numDsts;
+  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
+  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
+
+  // 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration
+  // Determine the number of packed floats processed by the first loop
+  size_t Nrem = p.N;
+  size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
+  size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
+  size_t const loop1Inc   = BLOCKSIZE * LOOP1_UNROLL;
+  size_t loop1Offset      = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
+
+  while (loop1Offset < loop1Npack)
+  {
+    PackedFloat_t vals[LOOP1_UNROLL] = {};
+
+    if (numSrcs == 0)
+    {
+      #pragma unroll
+      for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal<PackedFloat_t>();
+    }
+    else
+    {
+      for (int i = 0; i < numSrcs; ++i)
+      {
+        PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset;
+        #pragma unroll
+        for (int u = 0; u < LOOP1_UNROLL; ++u)
+          vals[u] += *(packedSrc + u * WARP_SIZE);
+      }
+    }
+
+    for (int i = 0; i < numDsts; ++i)
+    {
+      PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset;
+      #pragma unroll
+      for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u];
+    }
+    loop1Offset += loop1Inc;
+  }
+  Nrem -= loop1Nelem;
+
+  if (Nrem > 0)
+  {
+    // 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
+    // NOTE: Using int32_t due to smaller size requirements
+    int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
+    int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
+    int32_t const loop2Inc   = BLOCKSIZE;
+    int32_t loop2Offset      = threadIdx.x;
+
+    while (loop2Offset < loop2Npack)
+    {
+      PackedFloat_t val;
+      if (numSrcs == 0)
+      {
+        val = MemsetVal<PackedFloat_t>();
+      }
+      else
+      {
+        val = {};
+        for (int i = 0; i < numSrcs; ++i)
+        {
+          PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset;
+          val += *packedSrc;
+        }
+      }
+
+      for (int i = 0; i < numDsts; ++i)
+      {
+        PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset;
+        *packedDst = val;
+      }
+      loop2Offset += loop2Inc;
+    }
+    Nrem -= loop2Nelem;
+
+    // Deal with leftovers (less than FLOATS_PER_PACK)
+    if (threadIdx.x < Nrem)
+    {
+      int offset = loop1Nelem + loop2Nelem + threadIdx.x;
+      float val = 0;
+      if (numSrcs == 0)
+      {
+        val = MEMSET_VAL;
+      }
+      else
+      {
+        for (int i = 0; i < numSrcs; ++i)
+          val += p.src[i][offset];
+      }
+
+      for (int i = 0; i < numDsts; ++i)
+        p.dst[i][offset] = val;
+    }
+  }
+
+  __syncthreads();
+  if (threadIdx.x == 0)
+  {
+    p.startCycle = startCycle;
+    p.stopCycle  = wall_clock64();
+  }
+}
+
+template <typename FLOAT_TYPE, int UNROLL_FACTOR>
+__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
+{
+  int constexpr numFloatsPerPack   = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
+  size_t constexpr loopPackInc     = BLOCKSIZE * UNROLL_FACTOR;
+  size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
+  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
+  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
+  int const numSrcs  = p.numSrcs;
+  int const numDsts  = p.numDsts;
+  size_t const numPacksDone  = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? N : (N / (numFloatsPerPack * numPacksPerWave)) * numPacksPerWave;
+  size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack;
+  size_t loopPackOffset = waveId * numPacksPerWave + threadId;
+
+  while (loopPackOffset < numPacksDone)
+  {
+    FLOAT_TYPE vals[UNROLL_FACTOR];
+
+    if (numSrcs == 0)
+    {
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal<FLOAT_TYPE>();
+    }
+    else
+    {
+      FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset;
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        vals[u] = *(src0Ptr + u * WARP_SIZE);
+
+      for (int i = 1; i < numSrcs; ++i)
+      {
+        FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset;
+
+        #pragma unroll UNROLL_FACTOR
+        for (int u = 0; u < UNROLL_FACTOR; ++u)
+          vals[u] += *(srcPtr + u * WARP_SIZE);
+      }
+    }
+
+    for (int i = 0; i < numDsts; ++i)
+    {
+      FLOAT_TYPE* __restrict__ dstPtr = ((FLOAT_TYPE*)(p.dst[i] + offset)) + loopPackOffset;
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        *(dstPtr + u * WARP_SIZE) = vals[u];
+    }
+    loopPackOffset += loopPackInc;
+  }
+
+  return numFloatsLeft;
+}
+
+template <typename FLOAT_TYPE, int UNROLL_FACTOR>
+__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N)
+{
+  // Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
+  int constexpr numFloatsPerRead          = sizeof(FLOAT_TYPE) / sizeof(float);
+  size_t constexpr numFloatsPerInnerLoop  = BLOCKSIZE * numFloatsPerRead;
+  size_t constexpr numFloatsPerOuterLoop  = numFloatsPerInnerLoop * UNROLL_FACTOR;
+  size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
+  size_t const numFloatsDone = N - numFloatsLeft;
+  int const numSrcs = p.numSrcs;
+  int const numDsts = p.numDsts;
+
+  for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop)
+  {
+    FLOAT_TYPE tmp[UNROLL_FACTOR];
+
+    if (numSrcs == 0)
+    {
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        tmp[u] = MemsetVal<FLOAT_TYPE>();
+    }
+    else
+    {
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop]));
+
+      for (int i = 1; i < numSrcs; ++i)
+      {
+        #pragma unroll UNROLL_FACTOR
+        for (int u = 0; u < UNROLL_FACTOR; ++u)
+          tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop]));
+      }
+    }
+
+    for (int i = 0; i < numDsts; ++i)
+    {
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+      {
+        *((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u];
+      }
+    }
+  }
+  return numFloatsLeft;
+}
+
+template <typename FLOAT_TYPE>
+__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
+{
+  switch (unroll)
+  {
+  case 1:  return GpuReduceFuncImpl<FLOAT_TYPE,  1>(p, offset, N);
+  case 2:  return GpuReduceFuncImpl<FLOAT_TYPE,  2>(p, offset, N);
+  case 3:  return GpuReduceFuncImpl<FLOAT_TYPE,  3>(p, offset, N);
+  case 4:  return GpuReduceFuncImpl<FLOAT_TYPE,  4>(p, offset, N);
+  case 5:  return GpuReduceFuncImpl<FLOAT_TYPE,  5>(p, offset, N);
+  case 6:  return GpuReduceFuncImpl<FLOAT_TYPE,  6>(p, offset, N);
+  case 7:  return GpuReduceFuncImpl<FLOAT_TYPE,  7>(p, offset, N);
+  case 8:  return GpuReduceFuncImpl<FLOAT_TYPE,  8>(p, offset, N);
+  case 9:  return GpuReduceFuncImpl<FLOAT_TYPE,  9>(p, offset, N);
+  case 10: return GpuReduceFuncImpl<FLOAT_TYPE, 10>(p, offset, N);
+  case 11: return GpuReduceFuncImpl<FLOAT_TYPE, 11>(p, offset, N);
+  case 12: return GpuReduceFuncImpl<FLOAT_TYPE, 12>(p, offset, N);
+  case 13: return GpuReduceFuncImpl<FLOAT_TYPE, 13>(p, offset, N);
+  case 14: return GpuReduceFuncImpl<FLOAT_TYPE, 14>(p, offset, N);
+  case 15: return GpuReduceFuncImpl<FLOAT_TYPE, 15>(p, offset, N);
+  case 16: return GpuReduceFuncImpl<FLOAT_TYPE, 16>(p, offset, N);
+  default: return GpuReduceFuncImpl<FLOAT_TYPE,  8>(p, offset, N);
+  }
+}
+
+// GPU copy kernel
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel2(SubExecParam* params)
+{
+  int64_t startCycle = wall_clock64();
+  SubExecParam& p = params[blockIdx.x];
+
+  size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
+  if (numFloatsLeft)
+    numFloatsLeft = GpuReduceFunc<float4>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
+
+  if (numFloatsLeft)
+    GpuReduceFunc<float>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
+
+  __threadfence_system();
+  if (threadIdx.x == 0)
+  {
+    p.startCycle = startCycle;
+    p.stopCycle  = wall_clock64();
+  }
+}
+
+#define NUM_GPU_KERNELS 18
+typedef void (*GpuKernelFuncPtr)(SubExecParam*);
+
+GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] =
+{
+  GpuReduceKernel<8>,
+  GpuReduceKernel<1>,
+  GpuReduceKernel<2>,
+  GpuReduceKernel<3>,
+  GpuReduceKernel<4>,
+  GpuReduceKernel<5>,
+  GpuReduceKernel<6>,
+  GpuReduceKernel<7>,
+  GpuReduceKernel<8>,
+  GpuReduceKernel<9>,
+  GpuReduceKernel<10>,
+  GpuReduceKernel<11>,
+  GpuReduceKernel<12>,
+  GpuReduceKernel<13>,
+  GpuReduceKernel<14>,
+  GpuReduceKernel<15>,
+  GpuReduceKernel<16>,
+  GpuReduceKernel2
+};
+
+std::string GpuKernelNames[NUM_GPU_KERNELS] =
+{
+  "Default - 8xUnroll",
+  "Unroll x1",
+  "Unroll x2",
+  "Unroll x3",
+  "Unroll x4",
+  "Unroll x5",
+  "Unroll x6",
+  "Unroll x7",
+  "Unroll x8",
+  "Unroll x9",
+  "Unroll x10",
+  "Unroll x11",
+  "Unroll x12",
+  "Unroll x13",
+  "Unroll x14",
+  "Unroll x15",
+  "Unroll x16",
+  "8xUnrollB",
+};
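The GPU_KERNEL env var (see EnvVars.hpp) indexes GpuKernelTable, so GPU_KERNEL=0 selects the default 8x-unrolled kernel and GPU_KERNEL=17 the two-pass GpuReduceKernel2. A host-side dispatch sketch (illustrative; ev, numSubExecs, stream, and subExecParamGpu are assumed variables, not names introduced by this patch):

```cpp
// Illustrative dispatch under the assumptions stated above.
hipLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],       // kernel selected via GPU_KERNEL
                   dim3(numSubExecs), dim3(BLOCKSIZE), // one threadblock per subExecutor
                   ev.sharedMemBytes, stream,
                   subExecParamGpu);
```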
diff --git a/tfb.so/include/TransferBench.hpp b/tfb.so/include/TransferBench.hpp
new file mode 100644
index 00000000..6abbbc5b
--- /dev/null
+++ b/tfb.so/include/TransferBench.hpp
@@ -0,0 +1,198 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <numa.h>
+#include <set>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+#include "Compatibility.hpp"
+
+// Helper macro for catching HIP errors
+#define HIP_CALL(cmd)                                                     \
+  do {                                                                    \
+    hipError_t error = (cmd);                                             \
+    if (error != hipSuccess)                                              \
+    {                                                                     \
+      std::cerr << "Encountered HIP error (" << hipGetErrorString(error)  \
+                << ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
+      exit(-1);                                                           \
+    }                                                                     \
+  } while (0)
+
+#include "EnvVars.hpp"
+
+// Simple configuration parameters
+size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer
+
+// Different src/dst memory types supported
+typedef enum
+{
+  MEM_CPU          = 0, // Coarse-grained pinned CPU memory
+  MEM_GPU          = 1, // Coarse-grained global GPU memory
+  MEM_CPU_FINE     = 2, // Fine-grained pinned CPU memory
+  MEM_GPU_FINE     = 3, // Fine-grained global GPU memory
+  MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
+  MEM_NULL         = 5, // NULL memory - used for empty
+} MemType;
+
+typedef enum
+{
+  EXE_CPU     = 0, // CPU executor (subExecutor = CPU thread)
+  EXE_GPU_GFX = 1, // GPU kernel-based executor (subExecutor = threadblock/CU)
+  EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams)
+} ExeType;
+
+bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE); }
+bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
+bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
+bool IsCpuType(ExeType e) { return (e == EXE_CPU); };
+
+char const MemTypeStr[7] = "CGBFUN";
+char const ExeTypeStr[4] = "CGD";
+char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
+
+MemType inline CharToMemType(char const c)
+{
+  char const* val = strchr(MemTypeStr, toupper(c));
+  if (val) return (MemType)(val - MemTypeStr);
+  printf("[ERROR] Unexpected memory type (%c)\n", c);
+  exit(1);
+}
+
+ExeType inline CharToExeType(char const c)
+{
+  char const* val = strchr(ExeTypeStr, toupper(c));
+  if (val) return (ExeType)(val - ExeTypeStr);
+  printf("[ERROR] Unexpected executor type (%c)\n", c);
+  exit(1);
+}
+
+// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified)
+// then writes the summation to each of the specified destination memory location(s)
+struct Transfer
+{
+  int     transferIndex;  // Transfer identifier (within a Test)
+  ExeType exeType;        // Transfer executor type
+  int     exeIndex;       // Executor index (NUMA node for CPU / device ID for GPU)
+  int     numSubExecs;    // Number of subExecutors to use for this Transfer
+  size_t  numBytes;       // # of bytes requested to Transfer (may be 0 to fallback to default)
+  size_t  numBytesActual; // Actual number of bytes to copy
+  double  transferTime;   // Time taken in milliseconds
+
+  int numSrcs;                   // Number of sources
+  std::vector<MemType> srcType;  // Source memory types
+  std::vector<int>     srcIndex; // Source device indices
+  std::vector<float*>  srcMem;   // Source memory
+
+  int numDsts;                   // Number of destinations
+  std::vector<MemType> dstType;  // Destination memory types
+  std::vector<int>     dstIndex; // Destination device indices
+  std::vector<float*>  dstMem;   // Destination memory
+
+  std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
+  SubExecParam* subExecParamGpuPtr;       // Pointer to GPU copy of subExecParam
+
+  // Prepares src/dst subarray pointers for each SubExecutor
+  void PrepareSubExecParams(EnvVars const& ev);
+
+  // Prepare source arrays with input data
+  bool PrepareSrc(EnvVars const& ev);
+
+  // Validate that destination data contains expected results
+  void ValidateDst(EnvVars const& ev);
+
+  // Prepare reference buffers
+  void PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx);
+
+  // String representation functions
+  std::string SrcToStr() const;
+  std::string DstToStr() const;
+};
+
+struct ExecutorInfo
+{
+  std::vector<Transfer*> transfers; // Transfers to execute
+  size_t totalBytes;                // Total bytes this executor transfers
+  int    totalSubExecs;             // Total number of subExecutors to use
+
+  // For GPU-Executors
+  SubExecParam* subExecParamGpu;    // GPU copy of subExecutor parameters
+  std::vector<hipStream_t> streams;
+  std::vector<hipEvent_t>  startEvents;
+  std::vector<hipEvent_t>  stopEvents;
+
+  // Results
+  double totalTime;
+};
+
+typedef std::pair<ExeType, int> Executor;
+typedef std::map<Executor, ExecutorInfo> TransferMap;
+
+// Display usage instructions
+void DisplayUsage(char const* cmdName);
+
+// Display detected GPU topology / CPU numa nodes
+void DisplayTopology(bool const outputToCsv);
+
+// Build array of test sizes based on sampling factor
+void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor,
+                       std::vector<size_t>& valuesofN);
+
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
+                  std::vector<MemType>& memType, std::vector<int>& memIndex);
+void ParseExeType(std::string const& token, int const numCpus, int const numGpus,
+                  ExeType& exeType, int& exeIndex);
+
+void ParseTransfers(char* line, int numCpus, int numGpus,
+                    std::vector<Transfer>& transfers);
+
+void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
+                      std::vector<Transfer>& transfers, bool verbose = true);
+
+void EnablePeerAccess(int const deviceId, int const peerDeviceId);
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
+void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
+void CheckPages(char* byteArray, size_t numBytes, int targetId);
+void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
+void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
+void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
+
+// Return the maximum bandwidth measured for given (src/dst) pair
+double GetPeakBandwidth(EnvVars const& ev, size_t const N,
+                        int const isBidirectional,
+                        MemType const srcType, int const srcIndex,
const srcIndex,
+                        MemType const dstType, int const dstIndex);
+
+std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
+
+int RemappedIndex(int const origIdx, bool const isCpuType);
+int GetWallClockRate(int deviceId);
+void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
+std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);
diff --git a/tfb.so/include/action.h b/tfb.so/include/action.h
new file mode 100644
index 00000000..7fdf2b35
--- /dev/null
+++ b/tfb.so/include/action.h
@@ -0,0 +1,86 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef TFB_SO_INCLUDE_ACTION_H_
+#define TFB_SO_INCLUDE_ACTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <pci/pci.h>
+#ifdef __cplusplus
+}
+#endif
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include "include/rvsactionbase.h"
+
+using std::vector;
+using std::string;
+using std::map;
+
+/**
+ * @class tfb_action
+ * @ingroup TFB
+ *
+ * @brief TFB (TransferBench) action implementation class
+ *
+ * Derives from rvs::actionbase and implements actual action functionality
+ * in its run() method.
+ *
+ */
+class tfb_action: public rvs::actionbase {
+ public:
+  tfb_action();
+  virtual ~tfb_action();
+
+  virtual int run(void);
+
+  std::string m_transfer_type;
+
+ protected:
+  //!
TRUE if JSON output is required
+  bool bjson;
+
+  //! reads all tfb-specific configuration keys from
+  //! the module's properties collection
+  bool get_all_tfb_config_keys(void);
+  /**
+   * @brief reads all common configuration keys from
+   * the module's properties collection
+   * @return true if no fatal error occurred, false otherwise
+   */
+  bool get_all_common_config_keys(void);
+
+  /**
+   * @brief gets the number of ROCm compatible AMD GPUs
+   * @return number of GPUs
+   */
+  int get_num_amd_gpu_devices(void);
+  //! creates the worker and starts the TransferBench run
+  bool start_tfb_runners();
+  //! invokes TransferBench for the configured transfer type
+  int run_transferbench(void);
+};
+
+#endif  // TFB_SO_INCLUDE_ACTION_H_
diff --git a/tfb.so/include/rvs_module.h b/tfb.so/include/rvs_module.h
new file mode 100644
index 00000000..9edaec65
--- /dev/null
+++ b/tfb.so/include/rvs_module.h
@@ -0,0 +1,31 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef TFB_SO_INCLUDE_RVS_MODULE_H_
+#define TFB_SO_INCLUDE_RVS_MODULE_H_
+
+#include "include/rvsliblog.h"
+
+
+#endif  // TFB_SO_INCLUDE_RVS_MODULE_H_
diff --git a/tfb.so/include/tfb_worker.h b/tfb.so/include/tfb_worker.h
new file mode 100644
index 00000000..10b1cfac
--- /dev/null
+++ b/tfb.so/include/tfb_worker.h
@@ -0,0 +1,72 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef TFB_SO_INCLUDE_TFB_WORKER_H_
+#define TFB_SO_INCLUDE_TFB_WORKER_H_
+
+#include <string>
+#include <memory>
+#include "include/rvsthreadbase.h"
+#include "include/rvs_blas.h"
+#include "include/rvs_util.h"
+
+#define GST_RESULT_PASS_MESSAGE "true"
+#define GST_RESULT_FAIL_MESSAGE "false"
+
+
+/**
+ * @class tfbWorker
+ * @ingroup TFB
+ *
+ * @brief tfbWorker action implementation class
+ *
+ * Derives from rvs::ThreadBase and implements actual action functionality
+ * in its run() method.
+ *
+ */
+class tfbWorker : public rvs::ThreadBase {
+ public:
+  tfbWorker();
+  virtual ~tfbWorker();
+
+  //! sets action name
+  void set_name(const std::string& name) { action_name = name; }
+  //! returns action name
+  const std::string& get_name(void) { return action_name; }
+
+  //! sets the transfer type to benchmark
+  void set_transfer_type(std::string type) { m_tfr_type = type; }
+  //! returns the transfer type
+  const std::string& get_transfer_type(void) { return m_tfr_type; }
+
+  int TfbRun(int &error, std::string &errdesc);
+
+ protected:
+  virtual void run(void);
+
+ protected:
+  //! name of the action
+  std::string action_name;
+  //! transfer type to execute
+  std::string m_tfr_type;
+};
+
+#endif  // TFB_SO_INCLUDE_TFB_WORKER_H_
diff --git a/tfb.so/src/.gitignore b/tfb.so/src/.gitignore
new file mode 100644
index 00000000..6677c873
--- /dev/null
+++ b/tfb.so/src/.gitignore
@@ -0,0 +1 @@
+/libmain.cpp
diff --git a/tfb.so/src/TransferBench.cpp b/tfb.so/src/TransferBench.cpp
new file mode 100644
index 00000000..51c7f68f
--- /dev/null
+++ b/tfb.so/src/TransferBench.cpp
@@ -0,0 +1,1814 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// This program measures simultaneous copy performance across multiple GPUs
+// on the same node
+#include <cstdlib>
+#include <random>
+#include <stack>
+#include <thread>
+#include <unistd.h>
+
+#include "TransferBench.hpp"
+#include "GetClosestNumaNode.hpp"
+
+
+
+void ExecuteTransfers(EnvVars const& ev,
+                      int const testNum,
+                      size_t const N,
+                      std::vector<Transfer>& transfers,
+                      bool verbose)
+{
+  int const initOffset = ev.byteOffset / sizeof(float);
+
+  // Map transfers by executor
+  TransferMap transferMap;
+  for (Transfer& transfer : transfers)
+  {
+    Executor executor(transfer.exeType, transfer.exeIndex);
+    ExecutorInfo& executorInfo = transferMap[executor];
+    executorInfo.transfers.push_back(&transfer);
+  }
+
+  // Loop over each executor and prepare sub-executors
+  std::map<int, Transfer*> transferList;
+  for (auto& exeInfoPair : transferMap)
+  {
+    Executor const& executor = exeInfoPair.first;
+    ExecutorInfo& exeInfo    = exeInfoPair.second;
+    ExeType const exeType    = executor.first;
+    int     const exeIndex   = RemappedIndex(executor.second, IsCpuType(exeType));
+
+    exeInfo.totalTime = 0.0;
+    exeInfo.totalSubExecs = 0;
+
+    // Loop over each transfer this executor is involved in
+    for (Transfer* transfer : exeInfo.transfers)
+    {
+      // Determine how many bytes to copy for this Transfer (use custom if pre-specified)
+      transfer->numBytesActual = (transfer->numBytes ? transfer->numBytes : N * sizeof(float));
+
+      // Allocate source memory
+      transfer->srcMem.resize(transfer->numSrcs);
+      for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
+      {
+        MemType const& srcType = transfer->srcType[iSrc];
+        int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
+
+        // Ensure executing GPU can access source memory
+        if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
+          EnablePeerAccess(exeIndex, srcIndex);
+
+        AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
+      }
+
+      // Allocate destination memory
+      transfer->dstMem.resize(transfer->numDsts);
+      for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
+      {
+        MemType const& dstType = transfer->dstType[iDst];
+        int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
+
+        // Ensure executing GPU can access destination memory
+        if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
+          EnablePeerAccess(exeIndex, dstIndex);
+
+        AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
+      }
+
+      exeInfo.totalSubExecs += transfer->numSubExecs;
+      transferList[transfer->transferIndex] = transfer;
+    }
+
+    // Prepare additional requirements for GPU-based executors
+    if (IsGpuType(exeType))
+    {
+      HIP_CALL(hipSetDevice(exeIndex));
+
+      // Single-stream mode is only supported for GFX-based executors
+      int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ?
exeInfo.transfers.size() : 1; + exeInfo.streams.resize(numStreamsToUse); + exeInfo.startEvents.resize(numStreamsToUse); + exeInfo.stopEvents.resize(numStreamsToUse); + for (int i = 0; i < numStreamsToUse; ++i) + { + HIP_CALL(hipStreamCreate(&exeInfo.streams[i])); + HIP_CALL(hipEventCreate(&exeInfo.startEvents[i])); + HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i])); + } + + if (exeType == EXE_GPU_GFX) + { + // Allocate one contiguous chunk of GPU memory for threadblock parameters + // This allows support for executing one transfer per stream, or all transfers in a single stream + AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam), + (void**)&exeInfo.subExecParamGpu); + } + } + } + + if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum); + + // Prepare input memory and block parameters for current N + bool isSrcCorrect = true; + for (auto& exeInfoPair : transferMap) + { + Executor const& executor = exeInfoPair.first; + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = executor.first; + int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType)); + + exeInfo.totalBytes = 0; + + int transferOffset = 0; + for (int i = 0; i < exeInfo.transfers.size(); ++i) + { + // Prepare subarrays each threadblock works on and fill src memory with patterned data + Transfer* transfer = exeInfo.transfers[i]; + transfer->PrepareSubExecParams(ev); + isSrcCorrect &= transfer->PrepareSrc(ev); + exeInfo.totalBytes += transfer->numBytesActual; + + // Copy block parameters to GPU for GPU executors + if (transfer->exeType == EXE_GPU_GFX) + { + exeInfo.transfers[i]->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset; + HIP_CALL(hipSetDevice(exeIndex)); + HIP_CALL(hipMemcpy(&exeInfo.subExecParamGpu[transferOffset], + transfer->subExecParam.data(), + transfer->subExecParam.size() * sizeof(SubExecParam), + hipMemcpyHostToDevice)); + HIP_CALL(hipDeviceSynchronize()); + + transferOffset += transfer->subExecParam.size(); + } + } + } + + // Launch kernels (warmup iterations are not counted) + double totalCpuTime = 0; + size_t numTimedIterations = 0; + std::stack threads; + for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++) + { + if (ev.numIterations > 0 && iteration >= ev.numIterations) break; + if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break; + + // Pause before starting first timed iteration in interactive mode + if (verbose && ev.useInteractive && iteration == 0) + { + printf("Memory prepared:\n"); + + for (Transfer& transfer : transfers) + { + printf("Transfer %03d:\n", transfer.transferIndex); + for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc) + printf(" SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]); + for (int iDst = 0; iDst < transfer.numDsts; ++iDst) + printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]); + } + printf("Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Start CPU timing for this iteration + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Execute all Transfers in parallel + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType exeType = exeInfoPair.first.first; + int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 
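+      // (in single-stream GFX mode one launch on stream 0 covers all of this
+      //  executor's Transfers, so only a single RunTransfer thread is spawned)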
1 : exeInfo.transfers.size(); + + for (int i = 0; i < numTransfersToRun; ++i) + threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i)); + } + + // Wait for all threads to finish + int const numTransfers = threads.size(); + for (int i = 0; i < numTransfers; i++) + { + threads.top().join(); + threads.pop(); + } + + // Stop CPU timing for this iteration + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); + + if (iteration >= 0) + { + ++numTimedIterations; + totalCpuTime += deltaSec; + } + } + + // Pause for interactive mode + if (verbose && isSrcCorrect && ev.useInteractive) + { + printf("Transfers complete. Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Validate that each transfer has transferred correctly + size_t totalBytesTransferred = 0; + int const numTransfers = transferList.size(); + for (auto transferPair : transferList) + { + Transfer* transfer = transferPair.second; + transfer->ValidateDst(ev); + totalBytesTransferred += transfer->numBytesActual; + } + + // Report timings + totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000; + double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime; + double maxGpuTime = 0; + + if (!isSrcCorrect) goto cleanup; + if (ev.useSingleStream) + { + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = exeInfoPair.first.second; + + // Compute total time for non GPU executors + if (exeType != EXE_GPU_GFX) + { + exeInfo.totalTime = 0; + for (auto const& transfer : exeInfo.transfers) + exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime); + } + + double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations); + double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, exeDurationMsec); + + if (verbose && !ev.outputToCsv) + { + printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes\n", + ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes); + } + + int totalCUs = 0; + for (auto const& transfer : exeInfo.transfers) + { + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + totalCUs += transfer->numSubExecs; + + if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, + transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + MemTypeStr[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + + if (verbose && ev.outputToCsv) + { + printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, + 
MemTypeStr[exeType], exeIndex, totalCUs, + exeBandwidthGbs, exeDurationMsec); + } + } + } + else + { + for (auto const& transferPair : transferList) + { + Transfer* transfer = transferPair.second; + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, transferDurationMsec); + if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + } + + // Display aggregate statistics + if (verbose) + { + if (!ev.outputToCsv) + { + printf(" Aggregate (CPU) | %7.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n", + totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime); + } + else + { + printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime); + } + } + + // Release GPU memory +cleanup: + for (auto exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType)); + + for (auto& transfer : exeInfo.transfers) + { + for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc) + { + MemType const& srcType = transfer->srcType[iSrc]; + DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset); + } + for (int iDst = 0; iDst < transfer->numDsts; ++iDst) + { + MemType const& dstType = transfer->dstType[iDst]; + DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset); + } + transfer->subExecParam.clear(); + } + + if (IsGpuType(exeType)) + { + int const numStreams = (int)exeInfo.streams.size(); + for (int i = 0; i < numStreams; ++i) + { + HIP_CALL(hipEventDestroy(exeInfo.startEvents[i])); + HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i])); + HIP_CALL(hipStreamDestroy(exeInfo.streams[i])); + } + + if (exeType == EXE_GPU_GFX) + { + DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu); + } + } + } +} + +void DisplayUsage(char const* cmdName) +{ + printf("TransferBench v%s\n", TB_VERSION); + printf("========================================\n"); + + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. 
Check to see if libnuma has been installed on this system\n"); + exit(1); + } + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + int const numCpuDevices = numa_num_configured_nodes(); + + printf("Usage: %s config \n", cmdName); + printf(" config: Either:\n"); + printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n"); + printf(" - Name of preset config:\n"); + printf(" p2p - Peer-to-peer benchmark tests\n"); + printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n"); + printf(" - 3rd/4th optional args for # GPU SubExecs / # CPU SubExecs per Transfer\n"); + printf(" N : (Optional) Number of bytes to copy per Transfer.\n"); + printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", + DEFAULT_BYTES_PER_TRANSFER); + printf(" If 0 is specified, a range of Ns will be benchmarked\n"); + printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n"); + printf("\n"); + + EnvVars::DisplayUsage(); +} + +int RemappedIndex(int const origIdx, bool const isCpuType) +{ + static std::vector remappingCpu; + static std::vector remappingGpu; + + // Build CPU remapping on first use + // Skip numa nodes that are not configured + if (remappingCpu.empty()) + { + for (int node = 0; node <= numa_max_node(); node++) + if (numa_bitmask_isbitset(numa_get_mems_allowed(), node)) + remappingCpu.push_back(node); + } + + // Build remappingGpu on first use + if (remappingGpu.empty()) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + remappingGpu.resize(numGpuDevices); + + int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0; + if (!usePcieIndexing) + { + // For HIP-based indexing no remappingGpu is necessary + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = i; + } + else + { + // Collect PCIe address for each GPU + std::vector> mapping; + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; ++i) + { + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i)); + mapping.push_back(std::make_pair(pciBusId, i)); + } + // Sort GPUs by PCIe address then use that as mapping + std::sort(mapping.begin(), mapping.end()); + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = mapping[i].second; + } + } + return isCpuType ? remappingCpu[origIdx] : remappingGpu[origIdx]; +} + +void DisplayTopology(bool const outputToCsv) +{ + + int numCpuDevices = numa_num_configured_nodes(); + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + + if (outputToCsv) + { + printf("NumCpus,%d\n", numCpuDevices); + printf("NumGpus,%d\n", numGpuDevices); + } + else + { + printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n", + numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices); + } + + // Print out detected CPU topology + if (outputToCsv) + { + printf("NUMA"); + for (int j = 0; j < numCpuDevices; j++) + printf(",NUMA%02d", j); + printf(",# CPUs,ClosestGPUs,ActualNode\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numCpuDevices; j++) + printf("NUMA %02d|", j); + printf(" #Cpus | Closest GPU(s)\n"); + + printf("------------+"); + for (int j = 0; j <= numCpuDevices; j++) + printf("-------+"); + printf("---------------\n"); + } + + for (int i = 0; i < numCpuDevices; i++) + { + int nodeI = RemappedIndex(i, true); + printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? 
"," : "|"); + for (int j = 0; j < numCpuDevices; j++) + { + int nodeJ = RemappedIndex(j, true); + int numaDist = numa_distance(nodeI, nodeJ); + if (outputToCsv) + printf("%d,", numaDist); + else + printf(" %5d |", numaDist); + } + + int numCpus = 0; + for (int j = 0; j < numa_num_configured_cpus(); j++) + if (numa_node_of_cpu(j) == nodeI) numCpus++; + if (outputToCsv) + printf("%d,", numCpus); + else + printf(" %5d | ", numCpus); + +#if !defined(__NVCC__) + bool isFirst = true; + for (int j = 0; j < numGpuDevices; j++) + { + if (GetClosestNumaNode(RemappedIndex(j, false)) == i) + { + if (isFirst) isFirst = false; + else printf(","); + printf("%d", j); + } + } +#endif + printf("\n"); + } + printf("\n"); + +#if defined(__NVCC__) + // No further topology detection done for NVIDIA platforms + return; +#endif + + // Print out detected GPU topology + if (outputToCsv) + { + printf("GPU"); + for (int j = 0; j < numGpuDevices; j++) + printf(",GPU %02d", j); + printf(",PCIe Bus ID,ClosestNUMA\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + { + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, j)); + std::string fullName = prop.gcnArchName; + std::string archName = fullName.substr(0, fullName.find(':')); + printf(" %6s |", archName.c_str()); + } + printf("\n"); + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + printf(" GPU %02d |", j); + printf(" PCIe Bus ID | #CUs | Closest NUMA\n"); + for (int j = 0; j <= numGpuDevices; j++) + printf("--------+"); + printf("--------------+------+-------------\n"); + } + +#if !defined(__NVCC__) + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; i++) + { + int const deviceIdx = RemappedIndex(i, false); + printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |"); + for (int j = 0; j < numGpuDevices; j++) + { + if (i == j) + { + if (outputToCsv) + printf("-,"); + else + printf(" - |"); + } + else + { + uint32_t linkType, hopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx, + RemappedIndex(j, false), + &linkType, &hopCount)); + printf("%s%s-%d%s", + outputToCsv ? "" : " ", + linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" : + linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" : + linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" : + linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" : + linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????", + hopCount, outputToCsv ? 
"," : " |"); + } + } + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx)); + + int numDeviceCUs = 0; + HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx)); + + if (outputToCsv) + printf("%s,%d,%d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + else + printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + } +#endif +} + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, + std::vector& memTypes, std::vector& memIndices) +{ + char typeChar; + int offset = 0, devIndex, inc; + bool found = false; + + memTypes.clear(); + memIndices.clear(); + while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2) + { + offset += inc; + MemType memType = CharToMemType(typeChar); + + if (IsCpuType(memType) && (devIndex < 0 || devIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, devIndex); + exit(1); + } + if (IsGpuType(memType) && (devIndex < 0 || devIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, devIndex); + exit(1); + } + + found = true; + if (memType != MEM_NULL) + { + memTypes.push_back(memType); + memIndices.push_back(devIndex); + } + } + if (!found) + { + printf("[ERROR] Unable to parse memory type token %s. Expected one of %s followed by an index\n", + token.c_str(), MemTypeStr); + exit(1); + } +} + +void ParseExeType(std::string const& token, int const numCpus, int const numGpus, + ExeType &exeType, int& exeIndex) +{ + char typeChar; + if (sscanf(token.c_str(), " %c%d", &typeChar, &exeIndex) != 2) + { + printf("[ERROR] Unable to parse valid executor token (%s). Exepected one of %s followed by an index\n", + token.c_str(), ExeTypeStr); + exit(1); + } + exeType = CharToExeType(typeChar); + + if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, exeIndex); + exit(1); + } + if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, exeIndex); + exit(1); + } +} + +// Helper function to parse a list of Transfer definitions +void ParseTransfers(char* line, int numCpus, int numGpus, std::vector& transfers) +{ + // Replace any round brackets or '->' with spaces, + for (int i = 1; line[i]; i++) + if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; + + transfers.clear(); + + int numTransfers = 0; + std::istringstream iss(line); + iss >> numTransfers; + if (iss.fail()) return; + + std::string exeMem; + std::string srcMem; + std::string dstMem; + + // If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes) + // otherwise read triples (srcMem, exeMem, dstMem) + bool const advancedMode = (numTransfers < 0); + numTransfers = abs(numTransfers); + + int numSubExecs; + if (!advancedMode) + { + iss >> numSubExecs; + if (numSubExecs <= 0 || iss.fail()) + { + printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numSubExecs); + exit(1); + } + } + + size_t numBytes = 0; + for (int i = 0; i < numTransfers; i++) + { + Transfer transfer; + transfer.transferIndex = i; + transfer.numBytes = 0; + transfer.numBytesActual = 0; + if (!advancedMode) + { + iss >> srcMem >> exeMem >> dstMem; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1); + 
exit(1); + } + } + else + { + std::string numBytesToken; + iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1); + exit(1); + } + if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1) + { + printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1); + exit(1); + } + char units = numBytesToken.back(); + switch (toupper(units)) + { + case 'K': numBytes *= 1024; break; + case 'M': numBytes *= 1024*1024; break; + case 'G': numBytes *= 1024*1024*1024; break; + } + } + + ParseMemType(srcMem, numCpus, numGpus, transfer.srcType, transfer.srcIndex); + ParseMemType(dstMem, numCpus, numGpus, transfer.dstType, transfer.dstIndex); + ParseExeType(exeMem, numCpus, numGpus, transfer.exeType, transfer.exeIndex); + + transfer.numSrcs = (int)transfer.srcType.size(); + transfer.numDsts = (int)transfer.dstType.size(); + if (transfer.numSrcs == 0 && transfer.numDsts == 0) + { + printf("[ERROR] Transfer must have at least one src or dst\n"); + exit(1); + } + + if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1)) + { + printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n"); + exit(1); + } + + transfer.numSubExecs = numSubExecs; + transfer.numBytes = numBytes; + transfers.push_back(transfer); + } +} + +void EnablePeerAccess(int const deviceId, int const peerDeviceId) +{ + int canAccess; + HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId)); + if (!canAccess) + { + printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId); + exit(1); + } + HIP_CALL(hipSetDevice(deviceId)); + hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0); + if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) + { + printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n", + deviceId, peerDeviceId, hipGetErrorString(error)); + exit(1); + } +} + +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr) +{ + if (numBytes == 0) + { + printf("[ERROR] Unable to allocate 0 bytes\n"); + exit(1); + } + *memPtr = nullptr; + if (IsCpuType(memType)) + { + // Set numa policy prior to call to hipHostMalloc + numa_set_preferred(devIndex); + + // Allocate host-pinned memory (should respect NUMA mem policy) + if (memType == MEM_CPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n"); + exit(1); +#else + HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); +#endif + } + else if (memType == MEM_CPU) + { +#if defined (__NVCC__) + if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess) +#else + if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess) +#endif + { + printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex); + exit(1); + } + } + else if (memType == MEM_CPU_UNPINNED) + { + *memPtr = numa_alloc_onnode(numBytes, devIndex); + } + + // Check that the allocated pages are actually on the correct NUMA node + memset(*memPtr, 0, numBytes); + CheckPages((char*)*memPtr, numBytes, devIndex); + + // Reset to default numa mem policy + numa_set_preferred(-1); + } + else if (IsGpuType(memType)) + { + if (memType == MEM_GPU) + { + // Allocate GPU memory on appropriate device + 
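+      // (hipSetDevice selects the target GPU so the following hipMalloc returns
+      //  coarse-grained device memory on it; the MEM_GPU_FINE branch below uses
+      //  hipExtMallocWithFlags with hipDeviceMallocFinegrained instead)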
HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipMalloc((void**)memPtr, numBytes)); + } + else if (memType == MEM_GPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n"); + exit(1); +#else + HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained)); + +#endif + } + HIP_CALL(hipMemset(*memPtr, 0, numBytes)); + HIP_CALL(hipDeviceSynchronize()); + } + else + { + printf("[ERROR] Unsupported memory type %d\n", memType); + exit(1); + } +} + +void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes) +{ + if (memType == MEM_CPU || memType == MEM_CPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null CPU pointer for %lu bytes. Skipping hipHostFree\n", bytes); + return; + } + HIP_CALL(hipHostFree(memPtr)); + } + else if (memType == MEM_CPU_UNPINNED) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes. Skipping numa_free\n", bytes); + return; + } + numa_free(memPtr, bytes); + } + else if (memType == MEM_GPU || memType == MEM_GPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. Skipping hipFree\n", bytes); + return; + } + HIP_CALL(hipFree(memPtr)); + } +} + +void CheckPages(char* array, size_t numBytes, int targetId) +{ + unsigned long const pageSize = getpagesize(); + unsigned long const numPages = (numBytes + pageSize - 1) / pageSize; + + std::vector pages(numPages); + std::vector status(numPages); + + pages[0] = array; + for (int i = 1; i < numPages; i++) + { + pages[i] = (char*)pages[i-1] + pageSize; + } + + long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0); + if (retCode) + { + printf("[ERROR] Unable to collect page info\n"); + exit(1); + } + + size_t mistakeCount = 0; + for (int i = 0; i < numPages; i++) + { + if (status[i] < 0) + { + printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i); + exit(1); + } + if (status[i] != targetId) mistakeCount++; + } + if (mistakeCount > 0) + { + printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId); + exit(1); + } +} + +void RunTransfer(EnvVars const& ev, int const iteration, + ExecutorInfo& exeInfo, int const transferIdx) +{ + Transfer* transfer = exeInfo.transfers[transferIdx]; + + if (transfer->exeType == EXE_GPU_GFX) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + // Figure out how many threadblocks to use. + // In single stream mode, all the threadblocks for this GPU are launched + // Otherwise, just launch the threadblocks associated with this single Transfer + int const numBlocksToRun = ev.useSingleStream ? 
exeInfo.totalSubExecs : transfer->numSubExecs; +#if defined(__NVCC__) + HIP_CALL(hipEventRecord(startEvent, stream)); + GpuKernelTable[ev.gpuKernel]<<>>(transfer->subExecParamGpuPtr); + HIP_CALL(hipEventRecord(stopEvent, stream)); +#else + hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel], + dim3(numBlocksToRun, 1, 1), + dim3(BLOCKSIZE, 1, 1), + ev.sharedMemBytes, stream, + startEvent, stopEvent, + 0, transfer->subExecParamGpuPtr); +#endif + // Synchronize per iteration, unless in single sync mode, in which case + // synchronize during last warmup / last actual iteration + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + + if (ev.useSingleStream) + { + // Figure out individual timings for Transfers that were all launched together + for (Transfer* currTransfer : exeInfo.transfers) + { + long long minStartCycle = currTransfer->subExecParamGpuPtr[0].startCycle; + long long maxStopCycle = currTransfer->subExecParamGpuPtr[0].stopCycle; + for (int i = 1; i < currTransfer->numSubExecs; i++) + { + minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle); + maxStopCycle = std::max(maxStopCycle, currTransfer->subExecParamGpuPtr[i].stopCycle); + } + int const wallClockRate = GetWallClockRate(exeIndex); + double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); + currTransfer->transferTime += iterationTimeMs; + } + exeInfo.totalTime += gpuDeltaMsec; + } + else + { + transfer->transferTime += gpuDeltaMsec; + } + } + } + else if (transfer->exeType == EXE_GPU_DMA) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + HIP_CALL(hipEventRecord(startEvent, stream)); + if (transfer->numSrcs == 0 && transfer->numDsts == 1) + { + HIP_CALL(hipMemsetAsync(transfer->dstMem[0], + MEMSET_CHAR, transfer->numBytesActual, stream)); + } + else if (transfer->numSrcs == 1 && transfer->numDsts == 1) + { + HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0], + transfer->numBytesActual, hipMemcpyDefault, + stream)); + } + HIP_CALL(hipEventRecord(stopEvent, stream)); + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + transfer->transferTime += gpuDeltaMsec; + } + } + else if (transfer->exeType == EXE_CPU) // CPU execution agent + { + // Force this thread and all child threads onto correct NUMA node + int const exeIndex = RemappedIndex(transfer->exeIndex, true); + if (numa_run_on_node(exeIndex)) + { + printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex); + exit(1); + } + + std::vector childThreads; + + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Launch each subExecutor in child-threads to perform memcopies + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i]))); + + // Wait for child-threads to finish + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads[i].join(); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + + // Record time if not a warmup iteration + if (iteration >= 0) + 
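+      // accumulate per-iteration wall-clock time in ms; the reporting code later
+      // divides transferTime by the number of timed iterations to get an average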
transfer->transferTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); + } +} + +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) +{ + ev.DisplayP2PBenchmarkEnvVars(); + + // Collect the number of available CPUs/GPUs on this machine + int const numCpus = ev.numCpuDevices; + int const numGpus = ev.numGpuDevices; + int const numDevices = numCpus + numGpus; + + // Enable peer to peer for each GPU + for (int i = 0; i < numGpus; i++) + for (int j = 0; j < numGpus; j++) + if (i != j) EnablePeerAccess(i, j); + + // Perform unidirectional / bidirectional + for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) + { + // Print header + if (!ev.outputToCsv) + { + printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni", + ev.useRemoteRead ? "Remote" : "Local", + ev.useRemoteRead ? "Local" : "Remote", + ev.useDmaCopy ? "DMA" : "GFX"); + + if (isBidirectional) + { + printf("%12s", "SRC\\DST"); + } + else + { + if (ev.useRemoteRead) + printf("%12s", "SRC\\EXE+DST"); + else + printf("%12s", "SRC+EXE\\DST"); + } + for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); + for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i); + printf("\n"); + } + + // Loop over all possible src/dst pairs + for (int src = 0; src < numDevices; src++) + { + MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU); + int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus); + + if (!ev.outputToCsv) + printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex); + + for (int dst = 0; dst < numDevices; dst++) + { + MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU); + int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus); + + double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcType, srcIndex, dstType, dstIndex); + if (!ev.outputToCsv) + { + if (bandwidth == 0) + printf("%10s", "N/A"); + else + printf("%10.2f", bandwidth); + } + else + { + printf("%s %02d,%s %02d,%s,%s,%s,%.2f,%lu\n", + srcType == MEM_CPU ? "CPU" : "GPU", srcIndex, + dstType == MEM_CPU ? "CPU" : "GPU", dstIndex, + isBidirectional ? "bidirectional" : "unidirectional", + ev.useRemoteRead ? "Remote" : "Local", + ev.useDmaCopy ? "DMA" : "GFX", + bandwidth, + N * sizeof(float)); + } + fflush(stdout); + } + if (!ev.outputToCsv) printf("\n"); + } + if (!ev.outputToCsv) printf("\n"); + } +} + +double GetPeakBandwidth(EnvVars const& ev, size_t const N, + int const isBidirectional, + MemType const srcType, int const srcIndex, + MemType const dstType, int const dstIndex) +{ + // Skip bidirectional on same device + if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f; + + // Prepare Transfers + std::vector transfers(2); + transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float); + + // SRC -> DST + transfers[0].numSrcs = transfers[0].numDsts = 1; + transfers[0].srcType.push_back(srcType); + transfers[0].dstType.push_back(dstType); + transfers[0].srcIndex.push_back(srcIndex); + transfers[0].dstIndex.push_back(dstIndex); + + // DST -> SRC + transfers[1].numSrcs = transfers[1].numDsts = 1; + transfers[1].srcType.push_back(dstType); + transfers[1].dstType.push_back(srcType); + transfers[1].srcIndex.push_back(dstIndex); + transfers[1].dstIndex.push_back(srcIndex); + + // Either perform (local read + remote write), or (remote read + local write) + ExeType gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX; + transfers[0].exeType = IsGpuType(ev.useRemoteRead ? 
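+  // (with useRemoteRead the executor sits on the destination side, i.e. remote
+  //  read + local write; otherwise it sits on the source side and performs a
+  //  local read + remote write, matching the benchmark header printed above)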
dstType : srcType) ? gpuExeType : EXE_CPU;
+  transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
+  transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
+  transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
+  transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
+  transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
+
+  // Remove (DST->SRC) if not bidirectional
+  transfers.resize(isBidirectional + 1);
+
+  // Abort if executing on NUMA node with no CPUs
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
+      return 0;
+
+#if defined(__NVCC__)
+    // NVIDIA platform cannot access GPU memory directly from CPU executors
+    if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
+      return 0;
+#endif
+  }
+
+  ExecuteTransfers(ev, 0, N, transfers, false);
+
+  // Collect aggregate bandwidth
+  double totalBandwidth = 0;
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    double transferDurationMsec = transfers[i].transferTime / (1.0 * ev.numIterations);
+    double transferBandwidthGbs = (transfers[i].numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
+    totalBandwidth += transferBandwidthGbs;
+  }
+  return totalBandwidth;
+}
+
+void Transfer::PrepareSubExecParams(EnvVars const& ev)
+{
+  // Each subExecutor needs to know src/dst pointers and how many elements to transfer
+  // Figure out the sub-array each subExecutor works on for this Transfer
+  // - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes,
+  //   except the very last one, for alignment reasons
+  size_t const N = this->numBytesActual / sizeof(float);
+  int const initOffset = ev.byteOffset / sizeof(float);
+  int const targetMultiple = ev.blockBytes / sizeof(float);
+
+  // In some cases, there may not be enough data for all subExecutors
+  int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs);
+  this->subExecParam.clear();
+  this->subExecParam.resize(this->numSubExecs);
+
+  size_t assigned = 0;
+  for (int i = 0; i < this->numSubExecs; ++i)
+  {
+    int const subExecLeft = std::max(0, maxSubExecToUse - i);
+    size_t const leftover = N - assigned;
+    size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
+
+    SubExecParam& p = this->subExecParam[i];
+    p.N = subExecLeft ?
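+    // e.g. for N = 1000 floats and targetMultiple = 64 (256-byte blocks):
+    // maxSubExecToUse = min(ceil(1000/64), numSubExecs) = min(16, numSubExecs),
+    // so with 4 subExecutors the split is 256/256/256/232 floats - every chunk
+    // is a multiple of 64 floats except possibly the last one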
std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0; + p.numSrcs = this->numSrcs; + p.numDsts = this->numDsts; + for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) + p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset; + for (int iDst = 0; iDst < this->numDsts; ++iDst) + p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset; + + if (ev.enableDebug) + { + printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n", + this->transferIndex, i, p.N, assigned, assigned + p.N); + } + + p.startCycle = 0; + p.stopCycle = 0; + assigned += p.N; + } + + this->transferTime = 0.0; +} + +void Transfer::PrepareReference(EnvVars const& ev, std::vector& buffer, int bufferIdx) +{ + size_t N = buffer.size(); + if (bufferIdx >= 0) + { + size_t patternLen = ev.fillPattern.size(); + if (patternLen > 0) + { + for (size_t i = 0; i < N; ++i) + buffer[i] = ev.fillPattern[i % patternLen]; + } + else + { + for (size_t i = 0; i < N; ++i) + buffer[i] = PrepSrcValue(bufferIdx, i); + } + } + else // Destination buffer + { + if (this->numSrcs == 0) + { + // Note: 0x75757575 = 13323083.0 + memset(buffer.data(), MEMSET_CHAR, N * sizeof(float)); + } + else + { + PrepareReference(ev, buffer, 0); + + if (this->numSrcs > 1) + { + std::vector temp(N); + for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx) + { + PrepareReference(ev, temp, srcIdx); + for (int i = 0; i < N; ++i) + { + buffer[i] += temp[i]; + } + } + } + } + } +} + +bool Transfer::PrepareSrc(EnvVars const& ev) +{ + if (this->numSrcs == 0) return true; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + std::vector reference(N); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float* srcPtr = this->srcMem[srcIdx] + initOffset; + PrepareReference(ev, reference, srcIdx); + + // Initialize source memory array with reference pattern + if (IsGpuType(this->srcType[srcIdx])) + { + int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + if (ev.usePrepSrcKernel) + PrepSrcDataKernel<<<32, BLOCKSIZE>>>(srcPtr, N, srcIdx); + else + HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + } + else if (IsCpuType(this->srcType[srcIdx])) + { + memcpy(srcPtr, reference.data(), this->numBytesActual); + } + + // Perform check just to make sure that data has been copied properly + float* srcCheckPtr = srcPtr; + std::vector srcCopy(N); + if (IsGpuType(this->srcType[srcIdx])) + { + if (!ev.validateDirect) + { + HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + srcCheckPtr = srcCopy.data(); + } + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != srcCheckPtr[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx); +#if !defined(__NVCC__) + float const val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]); +#endif + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + 
this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + return false; + } + } + } + return true; +} + +void Transfer::ValidateDst(EnvVars const& ev) +{ + if (this->numDsts == 0) return; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + std::vector reference(N); + PrepareReference(ev, reference, -1); + + std::vector hostBuffer(N); + for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx) + { + float* output; + if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect) + { + output = this->dstMem[dstIdx] + initOffset; + } + else + { + int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + output = hostBuffer.data(); + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != output[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float srcVal; + HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault)); +#if !defined(__NVCC__) + float val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal); +#endif + } + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); +#if !defined(__NVCC__) + float dstVal = this->dstMem[dstIdx][initOffset + i]; + printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal); +#else + printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]); +#endif + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + else + break; + } + } + } +} + +std::string Transfer::SrcToStr() const +{ + if (numSrcs == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numSrcs; ++i) + ss << MemTypeStr[srcType[i]] << srcIndex[i]; + return ss.str(); +} + +std::string Transfer::DstToStr() const +{ + if (numDsts == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numDsts; ++i) + ss << MemTypeStr[dstType[i]] << dstIndex[i]; + return ss.str(); +} + +// NOTE: This is a stop-gap solution until HIP provides wallclock values +int GetWallClockRate(int deviceId) +{ + static std::vector wallClockPerDeviceMhz; + + if (wallClockPerDeviceMhz.size() == 0) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + wallClockPerDeviceMhz.resize(numGpuDevices); + + for (int i = 0; i < numGpuDevices; i++) + { +#if defined(__NVCC__) + int value = 1410000; + //HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i)); + //value *= 1000; +#else + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, i)); + int value = 25000; + switch (prop.gcnArch) + { + case 906: case 910: value = 25000; break; + default: + printf("Unrecognized GCN arch %d\n", prop.gcnArch); + } +#endif + wallClockPerDeviceMhz[i] = 
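+      // (despite the "Mhz" suffix these rates are stored in kHz - e.g. 25000 for
+      //  a 25 MHz AMD wallclock - so dividing cycle deltas by them yields ms)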
value; + } + } + return wallClockPerDeviceMhz[deviceId]; +} + +void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom) +{ + ev.DisplaySweepEnvVars(); + + // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets) + std::vector> exeList; + for (auto exe : ev.sweepExe) + { + ExeType const exeType = CharToExeType(exe); + if (IsGpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex) + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + else if (IsCpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex) + { + // Skip NUMA nodes that have no CPUs (e.g. CXL) + if (ev.numCpusPerNuma[exeIndex] == 0) continue; + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + } + } + int numExes = exeList.size(); + + std::vector> srcList; + for (auto src : ev.sweepSrc) + { + MemType const srcType = CharToMemType(src); + int const numDevices = IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex) + srcList.push_back(std::make_pair(srcType, srcIndex)); + } + int numSrcs = srcList.size(); + + + std::vector> dstList; + for (auto dst : ev.sweepDst) + { + MemType const dstType = CharToMemType(dst); + int const numDevices = IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex) + dstList.push_back(std::make_pair(dstType, dstIndex)); + } + int numDsts = dstList.size(); + + // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count) + struct TransferInfo + { + MemType srcType; int srcIndex; + ExeType exeType; int exeIndex; + MemType dstType; int dstIndex; + }; + + // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI + bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0); + + std::vector possibleTransfers; + TransferInfo tinfo; + for (int i = 0; i < numExes; ++i) + { + // Skip CPU executors if XGMI link must be used + if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue; + tinfo.exeType = exeList[i].first; + tinfo.exeIndex = exeList[i].second; + + bool isXgmiSrc = false; + int numHopsSrc = 0; + for (int j = 0; j < numSrcs; ++j) + { + if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first)) + { + if (exeList[i].second != srcList[j].second) + { +#if defined(__NVCC__) + isXgmiSrc = false; +#else + uint32_t exeToSrcLinkType, exeToSrcHopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false), + RemappedIndex(srcList[j].second, false), + &exeToSrcLinkType, + &exeToSrcHopCount)); + isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI); + if (isXgmiSrc) numHopsSrc = exeToSrcHopCount; +#endif + } + else + { + isXgmiSrc = true; + numHopsSrc = 0; + } + + // Skip this SRC if it is not XGMI but only XGMI links may be used + if (useXgmiOnly && !isXgmiSrc) continue; + + // Skip this SRC if XGMI distance is already past limit + if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue; + } + else if (useXgmiOnly) continue; + + tinfo.srcType = srcList[j].first; + tinfo.srcIndex = srcList[j].second; + + bool isXgmiDst = false; + int numHopsDst = 0; + for (int k = 0; k < numDsts; ++k) + { + if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first)) + { + if (exeList[i].second != dstList[k].second) + { +#if defined(__NVCC__) + isXgmiSrc = 
+
+  std::vector<TransferInfo> possibleTransfers;
+  TransferInfo tinfo;
+  for (int i = 0; i < numExes; ++i)
+  {
+    // Skip CPU executors if XGMI link must be used
+    if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue;
+    tinfo.exeType  = exeList[i].first;
+    tinfo.exeIndex = exeList[i].second;
+
+    bool isXgmiSrc = false;
+    int numHopsSrc = 0;
+    for (int j = 0; j < numSrcs; ++j)
+    {
+      if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first))
+      {
+        if (exeList[i].second != srcList[j].second)
+        {
+#if defined(__NVCC__)
+          isXgmiSrc = false;
+#else
+          uint32_t exeToSrcLinkType, exeToSrcHopCount;
+          HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
+                                                RemappedIndex(srcList[j].second, false),
+                                                &exeToSrcLinkType,
+                                                &exeToSrcHopCount));
+          isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
+          if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
+#endif
+        }
+        else
+        {
+          isXgmiSrc = true;
+          numHopsSrc = 0;
+        }
+
+        // Skip this SRC if it is not XGMI but only XGMI links may be used
+        if (useXgmiOnly && !isXgmiSrc) continue;
+
+        // Skip this SRC if XGMI distance is already past limit
+        if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue;
+      }
+      else if (useXgmiOnly) continue;
+
+      tinfo.srcType  = srcList[j].first;
+      tinfo.srcIndex = srcList[j].second;
+
+      bool isXgmiDst = false;
+      int numHopsDst = 0;
+      for (int k = 0; k < numDsts; ++k)
+      {
+        if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first))
+        {
+          if (exeList[i].second != dstList[k].second)
+          {
+#if defined(__NVCC__)
+            isXgmiDst = false;
+#else
+            uint32_t exeToDstLinkType, exeToDstHopCount;
+            HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
+                                                  RemappedIndex(dstList[k].second, false),
+                                                  &exeToDstLinkType,
+                                                  &exeToDstHopCount));
+            isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
+            if (isXgmiDst) numHopsDst = exeToDstHopCount;
+#endif
+          }
+          else
+          {
+            isXgmiDst = true;
+            numHopsDst = 0;
+          }
+        }
+
+        // Skip this DST if it is not XGMI but only XGMI links may be used
+        if (useXgmiOnly && !isXgmiDst) continue;
+
+        // Skip this DST if total XGMI distance (SRC + DST) is less than min limit
+        if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue;
+
+        // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
+        if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
+
+#if defined(__NVCC__)
+        // Skip CPU executors on GPU memory on NVIDIA platform
+        if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
+          continue;
+#endif
+
+        tinfo.dstType  = dstList[k].first;
+        tinfo.dstIndex = dstList[k].second;
+
+        possibleTransfers.push_back(tinfo);
+      }
+    }
+  }
+
+  int const numPossible = (int)possibleTransfers.size();
+  int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
+
+  if (ev.sweepMin > numPossible)
+  {
+    printf("No valid test configurations exist\n");
+    return;
+  }
+
+  if (ev.outputToCsv)
+  {
+    printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
+           "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
+  }
+
+  int numTestsRun = 0;
+  int M = ev.sweepMin;
+  std::uniform_int_distribution<size_t> randSize(1, numBytesPerTransfer / sizeof(float));
+  std::uniform_int_distribution<int> distribution(ev.sweepMin, maxParallelTransfers);
+
+  // Log sweep to configuration file
+  FILE *fp = fopen("lastSweep.cfg", "w");
+  if (!fp)
+  {
+    printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
+    exit(1);
+  }
+
+  // Create bitmask of numPossible triplets, of which M will be chosen
+  std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
+  auto cpuStart = std::chrono::high_resolution_clock::now();
+  while (1)
+  {
+    if (isRandom)
+    {
+      // Pick a random number of simultaneous transfers to execute
+      // NOTE: This currently skews the distribution because some counts have more combinations than others
+      M = distribution(*ev.generator);
+
+      // Generate a random bitmask
+      for (int i = 0; i < numPossible; i++)
+        bitmask[i] = (i < M) ? 1 : 0;
+      std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator);
+    }
+
+    // Convert bitmask to list of Transfers
+    std::vector<Transfer> transfers;
+    for (int value = 0; value < numPossible; ++value)
+    {
+      if (bitmask[value])
+      {
+        // Convert integer value to (SRC->EXE->DST) triplet
+        Transfer transfer;
+        transfer.numSrcs       = 1;
+        transfer.numDsts       = 1;
+        transfer.srcType       = {possibleTransfers[value].srcType};
+        transfer.srcIndex      = {possibleTransfers[value].srcIndex};
+        transfer.exeType       = possibleTransfers[value].exeType;
+        transfer.exeIndex      = possibleTransfers[value].exeIndex;
+        transfer.dstType       = {possibleTransfers[value].dstType};
+        transfer.dstIndex      = {possibleTransfers[value].dstIndex};
+        transfer.numSubExecs   = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs;
+        transfer.transferIndex = transfers.size();
+        transfer.numBytes      = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0;
+        transfers.push_back(transfer);
+      }
+    }
+
+    LogTransfers(fp, ++numTestsRun, transfers);
+    ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers);
+
+    // Check for test limit
+    if (numTestsRun == ev.sweepTestLimit)
+    {
+      printf("Test limit reached\n");
+      break;
+    }
+
+    // Check for time limit
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+    if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
+    {
+      printf("Time limit exceeded\n");
+      break;
+    }
+
+    // Increment bitmask if not random sweep
+    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
+    {
+      M++;
+      // Check for completion
+      if (M > maxParallelTransfers)
+      {
+        printf("Sweep complete\n");
+        break;
+      }
+      for (int i = 0; i < numPossible; i++)
+        bitmask[i] = (i < M) ? 1 : 0;
+    }
+  }
+  fclose(fp);
+}
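The non-random sweep enumerates every size-M combination of the possible triplets by calling std::prev_permutation on a bitmask that starts as M ones followed by zeros. A minimal self-contained illustration of that technique (with hypothetical N and M, not values from the patch):

// Hedged sketch (not part of the patch): enumerating all M-of-N combinations the
// same way the sweep loop does. std::prev_permutation returns false once the mask
// reaches its lexicographically smallest arrangement, which signals completion.
#include <algorithm>
#include <cstdio>
#include <string>

int main()
{
  int const N = 4, M = 2;
  std::string bitmask(M, 1); bitmask.resize(N, 0);  // e.g. {1,1,0,0}
  do {
    for (int i = 0; i < N; i++) putchar(bitmask[i] ? '1' : '0');
    putchar('\n');                                  // prints all C(4,2) = 6 masks
  } while (std::prev_permutation(bitmask.begin(), bitmask.end()));
  return 0;
}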
+
+void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
+{
+  fprintf(fp, "# Test %d\n", testNum);
+  fprintf(fp, "%d", -1 * (int)transfers.size());
+  for (auto const& transfer : transfers)
+  {
+    fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)",
+            MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0],
+            ExeTypeStr[transfer.exeType], transfer.exeIndex,
+            MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0],
+            transfer.numSubExecs,
+            transfer.numBytes);
+  }
+  fprintf(fp, "\n");
+  fflush(fp);
+}
+
+std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset)
+{
+  std::stringstream ss;
+  for (size_t i = 0; i < strVector.size(); ++i)
+  {
+    if (i) ss << " ";
+    ss << (strVector[i] + initOffset);
+  }
+  return ss.str();
+}
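For reference, each LogTransfers call appends one test record to lastSweep.cfg in the shape below. The concrete values are hypothetical, shown only to illustrate the format string (negative transfer count, then one src->exe->dst triplet with its sub-executor count and byte size per transfer):

# Test 1
-2 (G0->G1->G1 4 67108864) (C0->G2->G2 4 67108864)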
diff --git a/tfb.so/src/action.cpp b/tfb.so/src/action.cpp
new file mode 100644
index 00000000..0d0d4931
--- /dev/null
+++ b/tfb.so/src/action.cpp
@@ -0,0 +1,204 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/action.h"
+
+#include <string>
+#include <vector>
+#include <map>
+#include <regex>
+
+#define __HIP_PLATFORM_HCC__
+
+#include "include/rvs_key_def.h"
+#include "include/tfb_worker.h"
+#include "include/rvsactionbase.h"
+#include "include/rvsloglp.h"
+
+using std::string;
+using std::vector;
+using std::map;
+using std::regex;
+
+#define RVS_CONF_TFB_TYPE_KEY "transfer-type"
+#define MODULE_NAME "transfer-bench"
+#define MODULE_NAME_CAPS "TRANSFER-BENCH"
+#define TFB_DEFAULT_TYPE "p2p"
+#define TFB_NO_COMPATIBLE_GPUS "No AMD compatible GPU found!"
+
+#define FLOATING_POINT_REGEX "^[0-9]*\\.?[0-9]+$"
+
+#define JSON_CREATE_NODE_ERROR "JSON cannot create node"
+
+/**
+ * @brief default class constructor
+ */
+tfb_action::tfb_action() {
+  bjson = false;
+}
+
+/**
+ * @brief class destructor
+ */
+tfb_action::~tfb_action() {
+  property.clear();
+}
+
+/**
+ * @brief runs the TransferBench test session
+ * @return true if no error occurred, false otherwise
+ */
+bool tfb_action::start_tfb_runners() {
+  // one worker is sufficient as the test runner
+  tfbWorker worker;
+  worker.set_name(action_name);
+  worker.set_transfer_type(m_transfer_type);
+  worker.start();
+  worker.join();
+
+  return rvs::lp::Stopping() ? false : true;
+}
+
+/**
+ * @brief reads all TFB-related configuration keys from
+ * the module's properties collection
+ * @return true if no fatal error occurred, false otherwise
+ */
+bool tfb_action::get_all_tfb_config_keys(void) {
+  string msg;
+  bool bsts = true;
+
+  if (property_get(RVS_CONF_TFB_TYPE_KEY, &m_transfer_type,
+                   TFB_DEFAULT_TYPE)) {
+    msg = "invalid '" +
+        std::string(RVS_CONF_TFB_TYPE_KEY) + "' key value";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    bsts = false;
+  }
+
+  return bsts;
+}
+
+/**
+ * @brief reads all common configuration keys from
+ * the module's properties collection
+ * @return true if no fatal error occurred, false otherwise
+ */
+bool tfb_action::get_all_common_config_keys(void) {
+  bool bsts = true;
+
+  // placeholder for later
+
+  return bsts;
+}
+
+/**
+ * @brief gets the number of ROCm compatible AMD GPUs
+ * @return number of GPUs found, 0 if none are compatible
+ */
+int tfb_action::get_num_amd_gpu_devices(void) {
+  int hip_num_gpu_devices;
+  string msg;
+
+  hipGetDeviceCount(&hip_num_gpu_devices);
+  if (hip_num_gpu_devices == 0) {  // no AMD compatible GPU
+    msg = action_name + " " + MODULE_NAME + " " + TFB_NO_COMPATIBLE_GPUS;
+    rvs::lp::Log(msg, rvs::logerror);
+
+    if (bjson) {
+      unsigned int sec;
+      unsigned int usec;
+      rvs::lp::get_ticks(&sec, &usec);
+      void *json_root_node = rvs::lp::LogRecordCreate(MODULE_NAME,
+          action_name.c_str(), rvs::loginfo, sec, usec, true);
+      if (!json_root_node) {
+        // log the error
+        string msg = std::string(JSON_CREATE_NODE_ERROR);
+        rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+        return -1;
+      }
+
+      rvs::lp::AddString(json_root_node, "ERROR", TFB_NO_COMPATIBLE_GPUS);
+      rvs::lp::LogRecordFlush(json_root_node, rvs::loginfo);
+    }
+    return 0;
+  }
+  return hip_num_gpu_devices;
+}
+
+/**
+ * @brief gets all selected GPUs and starts the worker threads
+ * @return run result
+ */
+int tfb_action::run_transferbench(void) {
+  int hip_num_gpu_devices;
+  std::string msg;
+
+  hip_num_gpu_devices = get_num_amd_gpu_devices();
+
+  if (hip_num_gpu_devices > 0) {
+    if (start_tfb_runners())
+      return 0;
+
+    return -1;
+  } else {
+    msg = "No devices match criteria from the test configuration.";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    return -1;
+  }
+}
+
+/**
+ * @brief runs the whole TFB logic
+ * @return run result
+ */
+int tfb_action::run(void) {
+  string msg;
+
+  // get the action name
+  if (property_get(RVS_CONF_NAME_KEY, &action_name)) {
+    rvs::lp::Err("Action name missing", MODULE_NAME_CAPS);
+    return -1;
+  }
+
+  // check for -j flag (json logging)
+  if (property.find("cli.-j") != property.end())
+    bjson = true;
+  if (!get_all_common_config_keys())
+    return -1;
+  if (!get_all_tfb_config_keys())
+    return -1;
+
+  return run_transferbench();
+}
diff --git a/tfb.so/src/hiptest_worker.cpp b/tfb.so/src/hiptest_worker.cpp
new file mode 100644
index 00000000..9a31fc32
--- /dev/null
+++ b/tfb.so/src/hiptest_worker.cpp
@@ -0,0 +1,96 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/hiptest_worker.h"
+
+#include <string>
+
+#include "include/rvs_blas.h"
+#include "include/rvs_module.h"
+#include "include/rvsloglp.h"
+#include "include/rvs_util.h"
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define MODULE_NAME "hiptest"
+
+using std::string;
+
+//bool hipTestWorker::bjson = false;
+
+hipTestWorker::hipTestWorker() {}
+hipTestWorker::~hipTestWorker() {}
+
+/**
+ * @brief performs the stress test on the given GPU
+ */
+void hipTestWorker::run() {
+  string msg, err_description;
+  int error = 0;
+
+  // log the hip test start message
+  msg = "[" + action_name + "] " + MODULE_NAME + " " +
+        " Starting the Hip test ";
+  rvs::lp::Log(msg, rvs::logtrace);
+
+  // run the test and check the result
+  bool hipsuccess = start_hip_tests(error, err_description);
+
+  // the test process failed or could not be run
+  if (!hipsuccess || error) {
+    string msg = "[" + action_name + "] " + MODULE_NAME + " " +
+                 err_description;
+    rvs::lp::Log(msg, rvs::logerror);
+    return;
+  }
+}
+
+/**
+ * @brief forks and execs the test binary, then collects its exit status
+ * @return true if the test succeeded, false otherwise
+ */
+bool hipTestWorker::start_hip_tests(int &error, string &errdesc) {
+  pid_t pid;
+  int status;
+  auto found = m_test_path.find_last_of('/');
+  auto fname = m_test_path.substr(found + 1);
+  if ((pid = fork()) == 0) {  // child: replace image with the test binary
+    execl(m_test_path.c_str(), fname.c_str(), m_test_args.c_str(),
+          static_cast<char *>(nullptr));  // exec arg list must end with a null pointer
+    _exit(127);  // only reached if execl() fails
+  } else {  // parent: wait for the child and inspect its exit status
+    waitpid(pid, &status, 0);
+    if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+      error = 0;
+      return true;
+    }
+    error = 1;
+    errdesc = "hip test process failed";
+    return false;
+  }
+}
diff --git a/tfb.so/src/rvs_module.cpp b/tfb.so/src/rvs_module.cpp
new file mode 100644
index 00000000..f96a16f1
--- /dev/null
+++ b/tfb.so/src/rvs_module.cpp
@@ -0,0 +1,91 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/rvs_module.h"
+#include "include/action.h"
+#include "include/rvsloglp.h"
+#include "include/gpu_util.h"
+
+/**
+ * @defgroup TFB TFB Module
+ *
+ * @brief runs TransferBench data-transfer benchmarks
+ *
+ * The TFB module wraps TransferBench, which measures simultaneous copy
+ * performance across the CPUs and GPUs of a single node.
+ * Transfers can target one, some, or all GPUs, and the transfer type to
+ * run is selected through the 'transfer-type' configuration key.
+ *
+ */
+
+extern "C" int rvs_module_has_interface(int iid) {
+  int sts = 0;
+  switch (iid) {
+  case 0:
+  case 1:
+    sts = 1;
+  }
+  return sts;
+}
+
+extern "C" const char* rvs_module_get_description(void) {
+  return "ROCm Validation Suite TransferBench module";
+}
+
+extern "C" const char* rvs_module_get_config(void) {
+  return "transfer-type (string)";
+}
+
+extern "C" const char* rvs_module_get_output(void) {
+  return "pass (bool)";
+}
+
+extern "C" int rvs_module_init(void* pMi) {
+  rvs::lp::Initialize(pMi);
+  rvs::gpulist::Initialize();
+  return 0;
+}
+
+extern "C" int rvs_module_terminate(void) {
+  return 0;
+}
+
+extern "C" void* rvs_module_action_create(void) {
+  return static_cast<void*>(new tfb_action);
+}
+
+extern "C" int rvs_module_action_destroy(void* pAction) {
+  delete static_cast<tfb_action*>(pAction);
+  return 0;
+}
+
+extern "C" int rvs_module_action_property_set(void* pAction, const char* Key,
+                                              const char* Val) {
+  return static_cast<tfb_action*>(pAction)->property_set(Key, Val);
+}
+
+extern "C" int rvs_module_action_run(void* pAction) {
+  return static_cast<tfb_action*>(pAction)->run();
+}
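For orientation, the RVS launcher loads each module shared object and resolves these extern "C" entry points by name. A minimal sketch of that consumer side follows; the library path and the subset of symbols resolved are illustrative, not part of this patch.

// Hedged sketch (not part of the patch): resolving the entry points exported above.
#include <dlfcn.h>
#include <cstdio>

int main()
{
  void* so = dlopen("libtfb.so", RTLD_NOW);           // hypothetical path
  if (!so) { fprintf(stderr, "%s\n", dlerror()); return 1; }

  using desc_fn = const char* (*)(void);
  auto get_desc = reinterpret_cast<desc_fn>(dlsym(so, "rvs_module_get_description"));
  if (get_desc) printf("%s\n", get_desc());           // module self-description

  dlclose(so);
  return 0;
}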
diff --git a/tfb.so/src/tfb_worker.cpp b/tfb.so/src/tfb_worker.cpp
new file mode 100644
index 00000000..5ec11185
--- /dev/null
+++ b/tfb.so/src/tfb_worker.cpp
@@ -0,0 +1,104 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// This program measures simultaneous copy performance across multiple GPUs
+// on the same node
+#include <numa.h>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "include/tfb_worker.h"
+#include "include/rvsloglp.h"
+#include "include/TransferBench.hpp"
+#include "include/GetClosestNumaNode.hpp"
+
+static const std::string MODULE_NAME{"transfer-bench"};
+
+/**
+ * @brief runs the TransferBench test for this action
+ */
+void tfbWorker::run() {
+  std::string msg, err_description;
+  int error = 0;
+
+  // log the test start message
+  msg = "[" + action_name + "] " + MODULE_NAME + " " +
+        " Starting the TransferBench test ";
+  rvs::lp::Log(msg, rvs::logtrace);
+
+  // run the benchmark and check the result
+  TfbRun(error, err_description);
+
+  // the GPU was not able to do the processing (HIP error(s) occurred)
+  if (error) {
+    std::string msg = "[" + action_name + "] " + MODULE_NAME + " " +
+                      err_description;
+    rvs::lp::Log(msg, rvs::logerror);
+    return;
+  }
+}
+
+int tfbWorker::TfbRun(int &err, std::string &errmsg)
+{
+  // Check for NUMA library support
+  if (numa_available() == -1)
+  {
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
+  }
+
+  // Collect environment variables / display current run configuration
+  EnvVars ev;
+
+  // Determine number of bytes to run per Transfer
+  //size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
+  size_t numBytesPerTransfer = DEFAULT_BYTES_PER_TRANSFER * 1024 * 1024;
+  /*
+  if (argc > 2)
+  {
+    // Adjust bytes if unit specified
+    char units = argv[2][strlen(argv[2])-1];
+    switch (units)
+    {
+    case 'K': case 'k': numBytesPerTransfer *= 1024; break;
+    case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
+    case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
+    }
+  }*/
+  if (numBytesPerTransfer % 4)
+  {
+    printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
+    exit(1);
+  }
+
+  ev.configMode = CFG_P2P;
+  RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float));
+
+  return 0;
+}
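The commented-out block above carries over TransferBench's command-line handling, where a trailing K/M/G suffix scales the requested byte count. A small standalone helper restating that logic (the function name is hypothetical, not part of the patch):

// Hedged sketch (not part of the patch): the K/M/G suffix scaling the
// commented-out block above would apply to a size argument such as "64M".
#include <cstdlib>
#include <cstring>

size_t ParseBytesWithSuffix(const char* arg)
{
  size_t bytes = atoll(arg);              // parses the leading digits only
  switch (arg[strlen(arg) - 1])           // optional trailing unit character
  {
  case 'K': case 'k': bytes *= 1024; break;
  case 'M': case 'm': bytes *= 1024 * 1024; break;
  case 'G': case 'g': bytes *= 1024ULL * 1024 * 1024; break;
  }
  return bytes;                           // e.g. "64M" -> 67108864
}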
diff --git a/tfb.so/tests.cmake b/tfb.so/tests.cmake
new file mode 100644
index 00000000..c1b6b74e
--- /dev/null
+++ b/tfb.so/tests.cmake
@@ -0,0 +1,27 @@
+################################################################################
+##
+## Copyright (c) 2018 ROCm Developer Tools
+##
+## MIT LICENSE:
+## Permission is hereby granted, free of charge, to any person obtaining a copy of
+## this software and associated documentation files (the "Software"), to deal in
+## the Software without restriction, including without limitation the rights to
+## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+## of the Software, and to permit persons to whom the Software is furnished to do
+## so, subject to the following conditions:
+##
+## The above copyright notice and this permission notice shall be included in all
+## copies or substantial portions of the Software.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+## SOFTWARE.
+##
+################################################################################
+
+include(tests_conf_logging)
diff --git a/tfb.so/transferbench/CMakeLists.txt b/tfb.so/transferbench/CMakeLists.txt
new file mode 100644
index 00000000..18df727b
--- /dev/null
+++ b/tfb.so/transferbench/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+cmake_minimum_required(VERSION 3.5)
+
+if (DEFINED ENV{ROCM_PATH})
+  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
+else()
+  set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory")
+endif()
+
+project(TransferBench VERSION 1.0 LANGUAGES CXX)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
+include_directories(${ROCM_PATH}/include)
+link_libraries(numa hsa-runtime64 pthread)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ..)
+add_executable(TransferBench src/TransferBench.cpp)
+target_include_directories(TransferBench PRIVATE src/include)
+
+find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
+include(ROCMInstallTargets)
+include(ROCMCreatePackage)
+set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF)
+
+set(PACKAGE_NAME TB)
+set(LIBRARY_NAME TransferBench)
+
+rocm_install(TARGETS TransferBench)
+
+rocm_create_package(
+  NAME ${LIBRARY_NAME}
+  DESCRIPTION "TransferBench package"
+  MAINTAINER "RCCL Team "
+)