diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a750b23..0ddd669f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -474,7 +474,7 @@ add_subdirectory(rcqt.so)
 add_subdirectory(smqt.so)
 add_subdirectory(mem.so)
 add_subdirectory(babel.so)
-#add_subdirectory(edp.so)
+add_subdirectory(tfb.so)
 add_subdirectory(perf.so)
 
 if (RVS_BUILD_TESTS)
diff --git a/rvs/.rvsmodules.config b/rvs/.rvsmodules.config
index 047d0cac..af55776c 100644
--- a/rvs/.rvsmodules.config
+++ b/rvs/.rvsmodules.config
@@ -12,3 +12,4 @@ iet: libiet.so
 mem: libmem.so
 babel: libbabel.so
 perf: libperf.so
+tfb: libtfb.so
diff --git a/rvs/conf/tfb_single.conf b/rvs/conf/tfb_single.conf
new file mode 100644
index 00000000..4d7324c6
--- /dev/null
+++ b/rvs/conf/tfb_single.conf
@@ -0,0 +1,37 @@
+# ################################################################################
+# #
+# # Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+# #
+# # MIT LICENSE:
+# # Permission is hereby granted, free of charge, to any person obtaining a copy of
+# # this software and associated documentation files (the "Software"), to deal in
+# # the Software without restriction, including without limitation the rights to
+# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+# # of the Software, and to permit persons to whom the Software is furnished to do
+# # so, subject to the following conditions:
+# #
+# # The above copyright notice and this permission notice shall be included in all
+# # copies or substantial portions of the Software.
+# #
+# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# # SOFTWARE.
+# #
+# ###############################################################################
+
+
+# TransferBench test
+#
+# Run:
+#   sudo ./rvs -c conf/tfb_single.conf -d 3
+#
+# Expected result:
+#   TransferBench p2p transfers are executed
+actions:
+- name: p2p-benchmarks
+  module: tfb
+  transfer-type: p2p
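For context on how the new `tfb` entry is resolved at runtime: the launcher looks the action's module name up in `.rvsmodules.config` and dlopen()s the matching shared object. A minimal sketch (illustrative only — the actual loader is `rvs::module::find_create_module` in rvs/src/rvsmodule.cpp, and `../lib/rvs/` is just one of the prefixes it probes):

```cpp
// Minimal sketch of module resolution via .rvsmodules.config (illustrative).
#include <dlfcn.h>
#include <iostream>
#include <map>
#include <string>

int main() {
  // Parsed from .rvsmodules.config: module name -> shared object
  std::map<std::string, std::string> modules = {{"tfb", "libtfb.so"}};
  std::string sofullname = "../lib/rvs/" + modules["tfb"];  // assumed library prefix
  void* psolib = dlopen(sofullname.c_str(), RTLD_NOW);
  if (!psolib) {
    std::cerr << "dlopen failed: " << dlerror() << "\n";
    return 1;
  }
  dlclose(psolib);
  return 0;
}
```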
diff --git a/tfb.so/CMakeLists.txt b/tfb.so/CMakeLists.txt
new file mode 100644
index 00000000..ee2002d9
--- /dev/null
+++ b/tfb.so/CMakeLists.txt
@@ -0,0 +1,183 @@
+################################################################################
+##
+## Copyright (c) 2018 ROCm Developer Tools
+##
+## MIT LICENSE:
+## Permission is hereby granted, free of charge, to any person obtaining a copy of
+## this software and associated documentation files (the "Software"), to deal in
+## the Software without restriction, including without limitation the rights to
+## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+## of the Software, and to permit persons to whom the Software is furnished to do
+## so, subject to the following conditions:
+##
+## The above copyright notice and this permission notice shall be included in all
+## copies or substantial portions of the Software.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+## SOFTWARE.
+##
+################################################################################
+
+cmake_minimum_required ( VERSION 3.5.0 )
+if ( ${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
+  message(FATAL_ERROR "In-source build is not allowed")
+endif ()
+set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+
+set ( RVS "tfb" )
+set ( RVS_PACKAGE "rvs-roct" )
+set ( RVS_COMPONENT "lib${RVS}" )
+set ( RVS_TARGET "${RVS}" )
+
+project ( ${RVS_TARGET} )
+
+message(STATUS "MODULE: ${RVS}")
+add_compile_options(-std=c++11)
+add_compile_options(-Wall )
+if (RVS_COVERAGE)
+  add_compile_options(-O0 -fprofile-arcs -ftest-coverage)
+  set(CMAKE_EXE_LINKER_FLAGS "--coverage")
+  set(CMAKE_SHARED_LINKER_FLAGS "--coverage")
+endif()
+
+## Set default module path if not already set
+if ( NOT DEFINED CMAKE_MODULE_PATH )
+  set ( CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake_modules/" )
+endif ()
+
+## Include common cmake modules
+include ( utils )
+
+## Setup the package version.
+get_version ( "0.0.0" ) + +set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) +set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) +set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) + set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) +endif () +set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +## make version numbers visible to C code +add_compile_options(-DBUILD_VERSION_MAJOR=${VERSION_MAJOR}) +add_compile_options(-DBUILD_VERSION_MINOR=${VERSION_MINOR}) +add_compile_options(-DBUILD_VERSION_PATCH=${VERSION_PATCH}) +add_compile_options(-DLIB_VERSION_STRING="${LIB_VERSION_STRING}") +add_compile_options(-DBUILD_VERSION_STRING="${BUILD_VERSION_STRING}") + +set(ROCBLAS_LIB "rocblas") +set(HIP_HCC_LIB "amdhip64") + +# Determine HSA_PATH +if(NOT DEFINED HIPCC_PATH) + if(NOT DEFINED ENV{HIPCC_PATH}) + set(HIPCC_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which hipcc runtime has been installed") + else() + set(HIPCC_PATH $ENV{HIPCC_PATH} CACHE PATH "Path to which hipcc runtime has been installed") + endif() +endif() + +# Determine HSA_PATH +if(NOT DEFINED HSA_PATH) + if(NOT DEFINED ENV{HSA_PATH}) + set(HSA_PATH "${ROCM_PATH}/hsa" CACHE PATH "Path to which HSA runtime has been installed") + else() + set(HSA_PATH $ENV{HSA_PATH} CACHE PATH "Path to which HSA runtime has been installed") + endif() +endif() + +# Add HIP_VERSION to CMAKE__FLAGS +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}") + +set(HIP_HCC_BUILD_FLAGS) +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include ${ASAN_CXX_FLAGS}") + +# Set compiler and compiler flags +set(CMAKE_CXX_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_C_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS} -L${ROCM_PATH}/lib") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LD_FLAGS}") +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LD_FLAGS}") + +if(BUILD_ADDRESS_SANITIZER) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.asan-x86_64.so + OUTPUT_VARIABLE ASAN_LIB_FULL_PATH) + get_filename_component(ASAN_LIB_PATH ${ASAN_LIB_FULL_PATH} DIRECTORY) +endif() + +# Determine Roc Runtime header files are accessible +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h) + message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h) + message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +# Determine Roc Runtime header files are accessible +if(DEFINED RVS_ROCMSMI) + if(NOT RVS_ROCMSMI EQUAL 1) + if(NOT EXISTS ${ROCBLAS_INC_DIR}/rocblas.h) + message("ERROR: rocBLAS headers can't be found under specified path. Please set ROCBLAS_INC_DIR path. Current value is : " ${ROCBLAS_INC_DIR}) + RETURN() + endif() + + if(NOT EXISTS "${ROCBLAS_LIB_DIR}/lib${ROCBLAS_LIB}.so") + message("ERROR: rocBLAS library can't be found under specified path. 
+      RETURN()
+    endif()
+  endif()
+endif()
+
+
+if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
+  message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+  RETURN()
+endif()
+
+## define include directories
+include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${ROCM_PATH}/include)
+# Add directories to look for library files to link
+link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
+## additional libraries
+set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so numa hsa-runtime64 )
+
+## define source files
+set(SOURCES src/rvs_module.cpp src/action.cpp src/tfb_worker.cpp)
+
+## define target
+add_library( ${RVS_TARGET} SHARED ${SOURCES})
+set_target_properties(${RVS_TARGET} PROPERTIES
+  SUFFIX .so.${LIB_VERSION_STRING}
+  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
+#add_dependencies(${RVS_TARGET} rvslibrt rvslib)
+add_dependencies(${RVS_TARGET} rvslib)
+
+add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+COMMAND ln -fs ./lib${RVS}.so.${LIB_VERSION_STRING} lib${RVS}.so.${VERSION_MAJOR} WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+COMMAND ln -fs ./lib${RVS}.so.${VERSION_MAJOR} lib${RVS}.so WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+)
+
+install(TARGETS ${RVS_TARGET} LIBRARY DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule)
+install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR}" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule)
+install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule)
+
+# TEST SECTION
+#if (RVS_BUILD_TESTS)
+#  add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+#    COMMAND ln -fs ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR} ${RVS_BINTEST_FOLDER}/lib${RVS}.so WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+#  )
+#  include(${CMAKE_CURRENT_SOURCE_DIR}/tests.cmake)
+#endif()
diff --git a/tfb.so/include/Compatibility.hpp b/tfb.so/include/Compatibility.hpp
new file mode 100644
index 00000000..5e76cf50
--- /dev/null
+++ b/tfb.so/include/Compatibility.hpp
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__NVCC__)
+
+#include <cuda_runtime.h>
+
+// ROCm specific
+#define __builtin_amdgcn_s_memrealtime clock64
+#define gcnArchName name
+
+// Datatypes
+#define hipDeviceProp_t cudaDeviceProp
+#define hipError_t cudaError_t
+#define hipEvent_t cudaEvent_t
+#define hipStream_t cudaStream_t
+
+// Enumerations
+#define hipDeviceAttributeClockRate cudaDevAttrClockRate
+#define hipDeviceAttributeMaxSharedMemoryPerMultiprocessor cudaDevAttrMaxSharedMemoryPerMultiprocessor
+#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
+#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
+#define hipFuncCachePreferShared cudaFuncCachePreferShared
+#define hipMemcpyDefault cudaMemcpyDefault
+#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
+#define hipSuccess cudaSuccess
+
+// Functions
+#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer
+#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess
+#define hipDeviceGetAttribute cudaDeviceGetAttribute
+#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
+#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig
+#define hipDeviceSynchronize cudaDeviceSynchronize
+#define hipEventCreate cudaEventCreate
+#define hipEventDestroy cudaEventDestroy
+#define hipEventElapsedTime cudaEventElapsedTime
+#define hipEventRecord cudaEventRecord
+#define hipFree cudaFree
+#define hipGetDeviceCount cudaGetDeviceCount
+#define hipGetDeviceProperties cudaGetDeviceProperties
+#define hipGetErrorString cudaGetErrorString
+#define hipHostFree cudaFreeHost
+#define hipHostMalloc cudaMallocHost
+#define hipMalloc cudaMalloc
+#define hipMemcpy cudaMemcpy
+#define hipMemcpyAsync cudaMemcpyAsync
+#define hipMemset cudaMemset
+#define hipMemsetAsync cudaMemsetAsync
+#define hipSetDevice cudaSetDevice
+#define hipStreamCreate cudaStreamCreate
+#define hipStreamDestroy cudaStreamDestroy
+#define hipStreamSynchronize cudaStreamSynchronize
+
+// Define float4 addition operator for NVIDIA platform
+__device__ inline float4& operator +=(float4& a, const float4& b)
+{
+  a.x += b.x;
+  a.y += b.y;
+  a.z += b.z;
+  a.w += b.w;
+  return a;
+}
+
+#else
+
+#include <hip/hip_ext.h>
+#include <hip/hip_runtime.h>
+#include <hsa/hsa_ext_amd.h>
+
+#endif
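Compatibility.hpp lets the rest of tfb.so be written purely against HIP names on both vendors. A minimal compile sketch (illustrative; under __NVCC__ the macros above rewrite these calls to their cuda* equivalents):

```cpp
// Illustrative only: checks that the Compatibility.hpp aliases resolve on either platform.
#include "Compatibility.hpp"
#include <cstdio>

int main() {
  float host[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float* dev = nullptr;
  if (hipMalloc((void**)&dev, sizeof(host)) != hipSuccess) return 1;
  if (hipMemcpy(dev, host, sizeof(host), hipMemcpyHostToDevice) != hipSuccess) return 1;
  hipFree(dev);
  printf("copied %zu bytes\n", sizeof(host));
  return 0;
}
```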
diff --git a/tfb.so/include/EnvVars.hpp b/tfb.so/include/EnvVars.hpp
new file mode 100644
index 00000000..bac0746e
--- /dev/null
+++ b/tfb.so/include/EnvVars.hpp
@@ -0,0 +1,510 @@
+/*
+Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ENVVARS_HPP
+#define ENVVARS_HPP
+
+#include <algorithm>
+#include <numa.h>
+#include <random>
+#include "Compatibility.hpp"
+#include "Kernels.hpp"
+
+#define TB_VERSION "1.22"
+
+extern char const MemTypeStr[];
+extern char const ExeTypeStr[];
+
+enum ConfigModeEnum
+{
+  CFG_FILE  = 0,
+  CFG_P2P   = 1,
+  CFG_SWEEP = 2
+};
+
+// This class manages environment variables that affect TransferBench
+class EnvVars
+{
+public:
+  // Default configuration values
+  int const DEFAULT_NUM_WARMUPS     = 1;
+  int const DEFAULT_NUM_ITERATIONS  = 10;
+  int const DEFAULT_SAMPLING_FACTOR = 1;
+
+  // Peer-to-peer Benchmark preset defaults
+  int const DEFAULT_P2P_NUM_CPU_SE = 4;
+
+  // Sweep-preset defaults
+  std::string const DEFAULT_SWEEP_SRC = "CG";
+  std::string const DEFAULT_SWEEP_EXE = "CDG";
+  std::string const DEFAULT_SWEEP_DST = "CG";
+  int const DEFAULT_SWEEP_MIN        = 1;
+  int const DEFAULT_SWEEP_MAX        = 24;
+  int const DEFAULT_SWEEP_TEST_LIMIT = 0;
+  int const DEFAULT_SWEEP_TIME_LIMIT = 0;
+
+  // Environment variables
+  int blockBytes;       // Each CU, except the last, gets a multiple of this many bytes to copy
+  int byteOffset;       // Byte-offset for memory allocations
+  int continueOnError;  // Continue tests even after mismatch detected
+  int numCpuDevices;    // Number of CPU devices to use (defaults to # NUMA nodes detected)
+  int numGpuDevices;    // Number of GPU devices to use (defaults to # HIP devices detected)
+  int numIterations;    // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
+  int numWarmups;       // Number of un-timed warmup iterations to perform
+  int outputToCsv;      // Output in CSV format
+  int samplingFactor;   // Affects how many different values of N are generated (when N set to 0)
+  int sharedMemBytes;   // Amount of shared memory to use per threadblock
+  int useInteractive;   // Pause for user-input before starting transfer loop
+  int usePcieIndexing;  // Base GPU indexing on PCIe address instead of HIP device
+  int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
+  int useSingleStream;  // Use a single stream per GPU GFX executor instead of stream per Transfer
+  int validateDirect;   // Validate GPU destination memory directly instead of staging GPU memory on host
+
+  std::vector<float> fillPattern; // Pattern of floats used to fill source data
+
+  // Environment variables only for Benchmark-preset
+  int useRemoteRead;    // Use destination memory type as executor instead of source memory type
+  int useDmaCopy;       // Use DMA copy instead of GPU copy
+  int numGpuSubExecs;   // Number of GPU subexecutors to use
+  int numCpuSubExecs;   // Number of CPU subexecutors to use
+
+  // Environment variables only for Sweep-preset
+  int sweepMin;         // Min number of simultaneous Transfers to be executed per test
+  int sweepMax;         // Max number of simultaneous Transfers to be executed per test
+  int sweepTestLimit;   // Max number of tests to run during sweep (0 = no limit)
+  int sweepTimeLimit;   // Max number of seconds to run sweep for (0 = no limit)
+  int sweepXgmiMin;     // Min number of XGMI hops for Transfers
+  int sweepXgmiMax;     // Max number of XGMI hops for Transfers (-1 = no limit)
+  int sweepSeed;        // Random seed to use
+  int sweepRandBytes;   // Whether or not to use random number of bytes per Transfer
+  std::string sweepSrc; // Set of src memory types to be swept
+  std::string sweepExe; // Set of executors to be swept
+  std::string sweepDst; // Set of dst memory types to be swept
+
+  // Developer features
+  int enableDebug;      // Enable debug output
+  int gpuKernel;        // Which GPU kernel to use
+
+  // Used to track current configuration mode
+  ConfigModeEnum configMode;
+
+  // Random generator
+  std::default_random_engine *generator;
+
+  // Track how many CPUs are available per NUMA node
+  std::vector<int> numCpusPerNuma;
+
+  // Constructor that collects values
+  EnvVars()
+  {
+    int maxSharedMemBytes = 0;
+    HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes,
+                                   hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0));
+#if !defined(__NVCC__)
+    int defaultSharedMemBytes = maxSharedMemBytes / 2 + 1;
+#else
+    int defaultSharedMemBytes = 0;
+#endif
+
+    int numDeviceCUs = 0;
+    HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0));
+
+    int numDetectedCpus = numa_num_configured_nodes();
+    int numDetectedGpus;
+    HIP_CALL(hipGetDeviceCount(&numDetectedGpus));
+
+    hipDeviceProp_t prop;
+    HIP_CALL(hipGetDeviceProperties(&prop, 0));
+    std::string fullName = prop.gcnArchName;
+    std::string archName = fullName.substr(0, fullName.find(':'));
+
+    // Different hardware picks different GPU kernels
+    // This performance difference is generally only noticeable when executing on fewer CUs
+    int defaultGpuKernel = 0;
+    if      (archName == "gfx906") defaultGpuKernel = 13;
+    else if (archName == "gfx90a") defaultGpuKernel = 9;
+
+    blockBytes       = GetEnvVar("BLOCK_BYTES"       , 256);
+    byteOffset       = GetEnvVar("BYTE_OFFSET"       , 0);
+    continueOnError  = GetEnvVar("CONTINUE_ON_ERROR" , 0);
+    numCpuDevices    = GetEnvVar("NUM_CPU_DEVICES"   , numDetectedCpus);
+    numGpuDevices    = GetEnvVar("NUM_GPU_DEVICES"   , numDetectedGpus);
+    numIterations    = GetEnvVar("NUM_ITERATIONS"    , DEFAULT_NUM_ITERATIONS);
+    numWarmups       = GetEnvVar("NUM_WARMUPS"       , DEFAULT_NUM_WARMUPS);
+    outputToCsv      = GetEnvVar("OUTPUT_TO_CSV"     , 0);
+    samplingFactor   = GetEnvVar("SAMPLING_FACTOR"   , DEFAULT_SAMPLING_FACTOR);
+    sharedMemBytes   = GetEnvVar("SHARED_MEM_BYTES"  , defaultSharedMemBytes);
+    useInteractive   = GetEnvVar("USE_INTERACTIVE"   , 0);
+    usePcieIndexing  = GetEnvVar("USE_PCIE_INDEX"    , 0);
+    usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL"   , 0);
+    useSingleStream  = GetEnvVar("USE_SINGLE_STREAM" , 0);
+    validateDirect   = GetEnvVar("VALIDATE_DIRECT"   , 0);
+    enableDebug      = GetEnvVar("DEBUG"             , 0);
+    gpuKernel        = GetEnvVar("GPU_KERNEL"        , defaultGpuKernel);
+
+    // P2P Benchmark related
+    useRemoteRead    = GetEnvVar("USE_REMOTE_READ"   , 0);
+    useDmaCopy       = GetEnvVar("USE_GPU_DMA"       , 0);
+    numGpuSubExecs   = GetEnvVar("NUM_GPU_SE"        , useDmaCopy ? 1 : numDeviceCUs);
+    numCpuSubExecs   = GetEnvVar("NUM_CPU_SE"        , DEFAULT_P2P_NUM_CPU_SE);
+
+    // Sweep related
+    sweepMin         = GetEnvVar("SWEEP_MIN"         , DEFAULT_SWEEP_MIN);
+    sweepMax         = GetEnvVar("SWEEP_MAX"         , DEFAULT_SWEEP_MAX);
+    sweepSrc         = GetEnvVar("SWEEP_SRC"         , DEFAULT_SWEEP_SRC);
+    sweepExe         = GetEnvVar("SWEEP_EXE"         , DEFAULT_SWEEP_EXE);
+    sweepDst         = GetEnvVar("SWEEP_DST"         , DEFAULT_SWEEP_DST);
+    sweepTestLimit   = GetEnvVar("SWEEP_TEST_LIMIT"  , DEFAULT_SWEEP_TEST_LIMIT);
+    sweepTimeLimit   = GetEnvVar("SWEEP_TIME_LIMIT"  , DEFAULT_SWEEP_TIME_LIMIT);
+    sweepXgmiMin     = GetEnvVar("SWEEP_XGMI_MIN"    , 0);
+    sweepXgmiMax     = GetEnvVar("SWEEP_XGMI_MAX"    , -1);
+    sweepRandBytes   = GetEnvVar("SWEEP_RAND_BYTES"  , 0);
+
+    // Determine random seed
+    char *sweepSeedStr = getenv("SWEEP_SEED");
+    sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
+    generator = new std::default_random_engine(sweepSeed);
+
+    // Check for fill pattern
+    char* pattern = getenv("FILL_PATTERN");
+    if (pattern != NULL)
+    {
+      if (usePrepSrcKernel)
+      {
+        printf("[ERROR] Unable to use FILL_PATTERN and USE_PREP_KERNEL together\n");
+        exit(1);
+      }
+
+      int patternLen = strlen(pattern);
+      if (patternLen % 2)
+      {
+        printf("[ERROR] FILL_PATTERN must contain an even number of hex digits\n");
+        exit(1);
+      }
+
+      // Read in bytes
+      std::vector<unsigned char> bytes;
+      unsigned char val = 0;
+      for (int i = 0; i < patternLen; i++)
+      {
+        if ('0' <= pattern[i] && pattern[i] <= '9')
+          val += (pattern[i] - '0');
+        else if ('A' <= pattern[i] && pattern[i] <= 'F')
+          val += (pattern[i] - 'A' + 10);
+        else if ('a' <= pattern[i] && pattern[i] <= 'f')
+          val += (pattern[i] - 'a' + 10);
+        else
+        {
+          printf("[ERROR] FILL_PATTERN must contain an even number of hex digits (0-9/a-f/A-F). (not %c)\n", pattern[i]);
+          exit(1);
+        }
+
+        if (i % 2 == 0)
+          val <<= 4;
+        else
+        {
+          bytes.push_back(val);
+          val = 0;
+        }
+      }
+
+      // Reverse bytes (input is assumed to be given in big-endian)
+      std::reverse(bytes.begin(), bytes.end());
+
+      // Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
+      int copies;
+      switch (patternLen % 8)
+      {
+      case 0:  copies = 1; break;
+      case 4:  copies = 2; break;
+      default: copies = 4; break;
+      }
+
+      // Fill floats
+      int numFloats = copies * patternLen / 8;
+      fillPattern.resize(numFloats);
+      unsigned char* rawData = (unsigned char*) fillPattern.data();
+      for (int i = 0; i < numFloats * 4; i++)
+        rawData[i] = bytes[i % bytes.size()];
+    }
+    else fillPattern.clear();
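+
+    // Worked example (illustrative, not in the original source): FILL_PATTERN=deadbeef
+    // yields bytes {0xde,0xad,0xbe,0xef}, reversed to little-endian {0xef,0xbe,0xad,0xde}.
+    // patternLen % 8 == 0 gives copies = 1 and numFloats = 1, so every float in the
+    // source buffer is bit-filled with the 32-bit value 0xdeadbeef.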
+
+    // Perform some basic validation
+    if (numCpuDevices > numDetectedCpus)
+    {
+      printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
+      exit(1);
+    }
+    if (numGpuDevices > numDetectedGpus)
+    {
+      printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
+      exit(1);
+    }
+    if (byteOffset % sizeof(float))
+    {
+      printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
+      exit(1);
+    }
+    if (numWarmups < 0)
+    {
+      printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
+      exit(1);
+    }
+    if (samplingFactor < 1)
+    {
+      printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
+      exit(1);
+    }
+    if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
+    {
+      printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes);
+      exit(1);
+    }
+    if (blockBytes <= 0 || blockBytes % 4)
+    {
+      printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
+      exit(1);
+    }
+
+    if (numGpuSubExecs <= 0)
+    {
+      printf("[ERROR] NUM_GPU_SE must be greater than 0\n");
+      exit(1);
+    }
+
+    if (numCpuSubExecs <= 0)
+    {
+      printf("[ERROR] NUM_CPU_SE must be greater than 0\n");
+      exit(1);
+    }
+
+    for (auto ch : sweepSrc)
+    {
+      if (!strchr(MemTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
+        exit(1);
+      }
+    }
+
+    for (auto ch : sweepDst)
+    {
+      if (!strchr(MemTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
+        exit(1);
+      }
+    }
+
+    for (auto ch : sweepExe)
+    {
+      if (!strchr(ExeTypeStr, ch))
+      {
+        printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
+        exit(1);
+      }
+      if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
+      {
+        printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
+        exit(1);
+      }
+    }
+    if (gpuKernel < 0 || gpuKernel >= NUM_GPU_KERNELS)
+    {
+      printf("[ERROR] GPU kernel must be between 0 and %d\n", NUM_GPU_KERNELS - 1);
+      exit(1);
+    }
+
+    // Determine how many CPUs exist per NUMA node (to avoid executing on NUMA without CPUs)
+    numCpusPerNuma.resize(numDetectedCpus);
+    int const totalCpus = numa_num_configured_cpus();
+    for (int i = 0; i < totalCpus; i++)
+      numCpusPerNuma[numa_node_of_cpu(i)]++;
+
+    // Check for deprecated env vars
+    if (getenv("USE_HIP_CALL"))
+    {
+      printf("[WARN] USE_HIP_CALL has been deprecated. Please use DMA executor 'D' or set USE_GPU_DMA for P2P-Benchmark preset\n");
+      exit(1);
+    }
+
+    char* enableSdma = getenv("HSA_ENABLE_SDMA");
+    if (enableSdma && !strcmp(enableSdma, "0"))
+    {
+      printf("[WARN] DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. Copies will fallback to blit kernels\n");
+    }
+  }
+
+  // Display info on the env vars that can be used
+  static void DisplayUsage()
+  {
+    printf("Environment variables:\n");
+    printf("======================\n");
+    printf(" BLOCK_BYTES=B      - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
+    printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
+    printf(" CONTINUE_ON_ERROR  - Continue tests even after mismatch detected\n");
+    printf(" FILL_PATTERN=STR   - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
+    printf(" NUM_CPU_DEVICES=X  - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
+    printf(" NUM_GPU_DEVICES=X  - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
+    printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
+    printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
+    printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
+    printf(" SAMPLING_FACTOR=F  - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
+    printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
+    printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
+    printf(" USE_PCIE_INDEX     - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
+    printf(" USE_PREP_KERNEL    - Use GPU kernel to initialize source data array pattern\n");
+    printf(" USE_SINGLE_STREAM  - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
+    printf(" VALIDATE_DIRECT    - Validate GPU destination memory directly instead of staging GPU memory on host\n");
+  }
+
+  // Helper macro to switch between CSV and terminal output
+#define PRINT_EV(NAME, VALUE, DESCRIPTION) \
+  printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
+
+#define PRINT_ES(NAME, VALUE, DESCRIPTION) \
+  printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
"," : " : ", (DESCRIPTION).c_str()) + + // Display env var settings + void DisplayEnvVars() const + { + if (!outputToCsv) + { + printf("TransferBench v%s\n", TB_VERSION); + printf("=====================================================\n"); + printf("[Common]\n"); + } + else + printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION); + + PRINT_EV("BLOCK_BYTES", blockBytes, + std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy")); + PRINT_EV("BYTE_OFFSET", byteOffset, + std::string("Using byte offset of " + std::to_string(byteOffset))); + PRINT_EV("CONTINUE_ON_ERROR", continueOnError, + std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error")); + PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, + (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString())); + PRINT_EV("GPU_KERNEL", gpuKernel, + std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]"); + PRINT_EV("NUM_CPU_DEVICES", numCpuDevices, + std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices"); + PRINT_EV("NUM_GPU_DEVICES", numGpuDevices, + std::string("Using ") + std::to_string(numGpuDevices) + " GPU devices"); + PRINT_EV("NUM_ITERATIONS", numIterations, + std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " " + + (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test")); + PRINT_EV("NUM_WARMUPS", numWarmups, + std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test")); + PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes, + std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock")); + PRINT_EV("USE_INTERACTIVE", useInteractive, + std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode"); + PRINT_EV("USE_PCIE_INDEX", usePcieIndexing, + std::string("Use ") + (usePcieIndexing ? "PCIe" : "HIP") + " GPU device indexing"); + PRINT_EV("USE_PREP_KERNEL", usePrepSrcKernel, + std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data"); + PRINT_EV("USE_SINGLE_STREAM", useSingleStream, + std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer")); + PRINT_EV("VALIDATE_DIRECT", validateDirect, + std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer")); + printf("\n"); + }; + + // Display env var for P2P Benchmark preset + void DisplayP2PBenchmarkEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[P2P Related]\n"); + + PRINT_EV("NUM_CPU_SE", numCpuSubExecs, + std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors"); + PRINT_EV("NUM_GPU_SE", numGpuSubExecs, + std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors"); + PRINT_EV("USE_GPU_DMA", useDmaCopy, + std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor"); + PRINT_EV("USE_REMOTE_READ", useRemoteRead, + std::string("Using ") + (useRemoteRead ? 
"DST" : "SRC") + " as executor"); + printf("\n"); + } + + // Display env var settings + void DisplaySweepEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[Sweep Related]\n"); + PRINT_ES("SWEEP_DST", sweepDst.c_str(), + std::string("Destination Memory Types to sweep")); + PRINT_ES("SWEEP_EXE", sweepExe.c_str(), + std::string("Executor Types to sweep")); + PRINT_EV("SWEEP_MAX", sweepMax, + std::string("Max simultaneous transfers (0 = no limit)")); + PRINT_EV("SWEEP_MIN", sweepMin, + std::string("Min simultaenous transfers")); + PRINT_EV("SWEEP_RAND_BYTES", sweepRandBytes, + std::string("Using ") + (sweepRandBytes ? "random" : "constant") + " number of bytes per Transfer"); + PRINT_EV("SWEEP_SEED", sweepSeed, + std::string("Random seed set to ") + std::to_string(sweepSeed)); + PRINT_ES("SWEEP_SRC", sweepSrc.c_str(), + std::string("Source Memory Types to sweep")); + PRINT_EV("SWEEP_TEST_LIMIT", sweepTestLimit, + std::string("Max number of tests to run during sweep (0 = no limit)")); + PRINT_EV("SWEEP_TIME_LIMIT", sweepTimeLimit, + std::string("Max number of seconds to run sweep for (0 = no limit)")); + PRINT_EV("SWEEP_XGMI_MAX", sweepXgmiMax, + std::string("Max number of XGMI hops for Transfers (-1 = no limit)")); + PRINT_EV("SWEEP_XGMI_MIN", sweepXgmiMin, + std::string("Min number of XGMI hops for Transfers")); + printf("\n"); + } + + // Helper function that gets parses environment variable or sets to default value + static int GetEnvVar(std::string const& varname, int defaultValue) + { + if (getenv(varname.c_str())) + return atoi(getenv(varname.c_str())); + return defaultValue; + } + + static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue) + { + if (getenv(varname.c_str())) + return getenv(varname.c_str()); + return defaultValue; + } +}; + +#endif diff --git a/tfb.so/include/GetClosestNumaNode.hpp b/tfb.so/include/GetClosestNumaNode.hpp new file mode 100644 index 00000000..266f1eb1 --- /dev/null +++ b/tfb.so/include/GetClosestNumaNode.hpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+// Helper macro for checking HSA calls
+#define HSA_CHECK(cmd)                                                       \
+  do {                                                                       \
+    hsa_status_t error = (cmd);                                              \
+    if (error != HSA_STATUS_SUCCESS) {                                       \
+      const char* errString = NULL;                                          \
+      hsa_status_string(error, &errString);                                  \
+      std::cerr << "Encountered HSA error (" << errString << ") at line "    \
+                << __LINE__ << " in file " << __FILE__ << "\n";              \
+      exit(-1);                                                              \
+    }                                                                        \
+  } while (0)
+
+// Structure to hold HSA agent information
+#if !defined(__NVCC__)
+struct AgentData
+{
+  bool isInitialized;
+  std::vector<hsa_agent_t> cpuAgents;
+  std::vector<hsa_agent_t> gpuAgents;
+  std::vector<int> closestNumaNode;
+};
+
+// Simple callback function to return any memory pool for an agent
+hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
+{
+  hsa_amd_memory_pool_t* poolData = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
+
+  // Check memory pool flags
+  uint32_t poolFlags;
+  HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &poolFlags));
+
+  // Only consider coarse-grained pools
+  if (!(poolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) return HSA_STATUS_SUCCESS;
+
+  *poolData = pool;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Callback function to gather HSA agent information
+hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data)
+{
+  AgentData* agentData = reinterpret_cast<AgentData*>(data);
+
+  // Get the device type
+  hsa_device_type_t deviceType;
+  HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType));
+  if (deviceType == HSA_DEVICE_TYPE_CPU)
+    agentData->cpuAgents.push_back(agent);
+  if (deviceType == HSA_DEVICE_TYPE_GPU)
+  {
+    agentData->gpuAgents.push_back(agent);
+    agentData->closestNumaNode.push_back(0);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+AgentData& GetAgentData()
+{
+  static AgentData agentData = {};
+
+  if (!agentData.isInitialized)
+  {
+    agentData.isInitialized = true;
+
+    // Add all detected agents to the list
+    HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
+
+    // Loop over each GPU
+    for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++)
+    {
+      // Collect memory pool
+      hsa_amd_memory_pool_t pool;
+      HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
+
+      // Loop over each CPU agent and check distance
+      int bestDistance = -1;
+      for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++)
+      {
+        // Determine number of hops from GPU memory pool to CPU agent
+        uint32_t hops = 0;
+        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                     pool,
+                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
+                                                     &hops));
+        // Gather link info
+        hsa_amd_memory_pool_link_info_t* link_info =
+          (hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t));
+        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                     pool,
+                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
+                                                     link_info));
+        int numaDist = 0;
+        for (uint32_t k = 0; k < hops; k++)
+        {
+          numaDist += link_info[k].numa_distance;
+        }
+        if (bestDistance == -1 || numaDist < bestDistance)
+        {
+          agentData.closestNumaNode[i] = j;
+          bestDistance = numaDist;
+        }
+        free(link_info);
+      }
+    }
+  }
+  return agentData;
+}
+#endif
+
+// Returns closest CPU NUMA node to provided GPU
+// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing
+int GetClosestNumaNode(int gpuIdx)
+{
+#if defined(__NVCC__)
+  return -1;
+#else
+  AgentData& agentData = GetAgentData();
+  if (gpuIdx < 0 || gpuIdx >= (int)agentData.closestNumaNode.size())
+  {
+    printf("[ERROR] GPU index is out of bounds\n");
+    exit(1);
+  }
+  return agentData.closestNumaNode[gpuIdx];
+#endif
+}
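GetClosestNumaNode() is what allows host-side buffers to be placed near the executing GPU. A minimal usage sketch (assumed, not part of this patch — BindNearGpu is a hypothetical helper; numa_set_preferred comes from libnuma):

```cpp
#include <numa.h>

// Hypothetical helper: prefer allocations on the NUMA node closest to gpuIdx.
void BindNearGpu(int gpuIdx) {
  int node = GetClosestNumaNode(gpuIdx);  // returns -1 on NVIDIA builds
  if (node >= 0)
    numa_set_preferred(node);             // later heap allocations prefer this node
}
```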
diff --git a/tfb.so/include/Kernels.hpp b/tfb.so/include/Kernels.hpp
new file mode 100644
index 00000000..dcb6bf07
--- /dev/null
+++ b/tfb.so/include/Kernels.hpp
@@ -0,0 +1,409 @@
+/*
+Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#define PackedFloat_t float4
+#define WARP_SIZE 64
+#define BLOCKSIZE 256
+#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
+#define MEMSET_CHAR 75
+#define MEMSET_VAL 13323083.0f
+
+// Each subExecutor is provided with subarrays to work on
+#define MAX_SRCS 16
+#define MAX_DSTS 16
+struct SubExecParam
+{
+  size_t    N;             // Number of floats this subExecutor works on
+  int       numSrcs;       // Number of source arrays
+  int       numDsts;       // Number of destination arrays
+  float*    src[MAX_SRCS]; // Source array pointers
+  float*    dst[MAX_DSTS]; // Destination array pointers
+  long long startCycle;    // Start timestamp for in-kernel timing (GPU-GFX executor)
+  long long stopCycle;     // Stop timestamp for in-kernel timing (GPU-GFX executor)
+};
+
+void CpuReduceKernel(SubExecParam const& p)
+{
+  int const& numSrcs = p.numSrcs;
+  int const& numDsts = p.numDsts;
+
+  if (numSrcs == 0)
+  {
+    for (int i = 0; i < numDsts; ++i)
+      memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
+  }
+  else if (numSrcs == 1)
+  {
+    float const* __restrict__ src = p.src[0];
+    for (int i = 0; i < numDsts; ++i)
+    {
+      memcpy(p.dst[i], src, p.N * sizeof(float));
+    }
+  }
+  else
+  {
+    for (size_t j = 0; j < p.N; j++)
+    {
+      float sum = p.src[0][j];
+      for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
+      for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
+    }
+  }
+}
+
+std::string PrepSrcValueString()
+{
+  return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
+}
+
+__host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
+{
+  return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
+}
+
+// GPU kernel to prepare src buffer data
+__global__ void
+PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
+{
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+       idx < N;
+       idx += blockDim.x * gridDim.x)
+  {
+    ptr[idx] = PrepSrcValue(srcBufferIdx, idx);
+  }
+}
+
+// Helper function for memset
+template <typename T> __device__ __forceinline__ T      MemsetVal();
+template <>           __device__ __forceinline__ float  MemsetVal(){ return MEMSET_VAL; };
+template <>           __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
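+
+// Each GPU subExecutor is one threadblock: GpuReduceKernel below reads its own
+// SubExecParam (params[blockIdx.x]), element-wise sums its numSrcs source arrays,
+// broadcasts the result to all numDsts destination arrays, and records
+// wall_clock64() start/stop cycles for in-kernel timing.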
+// GPU copy kernel 0: 3 loops: unrolled float4s, float4s, floats
+template <int LOOP1_UNROLL>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params)
+{
+  int64_t startCycle = wall_clock64();
+
+  // Operate on wavefront granularity
+  SubExecParam& p    = params[blockIdx.x];
+  int const numSrcs  = p.numSrcs;
+  int const numDsts  = p.numDsts;
+  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
+  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
+
+  // 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration
+  // Determine the number of packed floats processed by the first loop
+  size_t Nrem = p.N;
+  size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
+  size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
+  size_t const loop1Inc   = BLOCKSIZE * LOOP1_UNROLL;
+  size_t loop1Offset      = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
+
+  while (loop1Offset < loop1Npack)
+  {
+    PackedFloat_t vals[LOOP1_UNROLL] = {};
+
+    if (numSrcs == 0)
+    {
+      #pragma unroll
+      for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal<PackedFloat_t>();
+    }
+    else
+    {
+      for (int i = 0; i < numSrcs; ++i)
+      {
+        PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset;
+        #pragma unroll
+        for (int u = 0; u < LOOP1_UNROLL; ++u)
+          vals[u] += *(packedSrc + u * WARP_SIZE);
+      }
+    }
+
+    for (int i = 0; i < numDsts; ++i)
+    {
+      PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset;
+      #pragma unroll
+      for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u];
+    }
+    loop1Offset += loop1Inc;
+  }
+  Nrem -= loop1Nelem;
+
+  if (Nrem > 0)
+  {
+    // 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
+    // NOTE: Using int32_t due to smaller size requirements
+    int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
+    int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
+    int32_t const loop2Inc   = BLOCKSIZE;
+    int32_t loop2Offset      = threadIdx.x;
+
+    while (loop2Offset < loop2Npack)
+    {
+      PackedFloat_t val;
+      if (numSrcs == 0)
+      {
+        val = MemsetVal<PackedFloat_t>();
+      }
+      else
+      {
+        val = {};
+        for (int i = 0; i < numSrcs; ++i)
+        {
+          PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset;
+          val += *packedSrc;
+        }
+      }
+
+      for (int i = 0; i < numDsts; ++i)
+      {
+        PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset;
+        *packedDst = val;
+      }
+      loop2Offset += loop2Inc;
+    }
+    Nrem -= loop2Nelem;
+
+    // Deal with leftovers (less than FLOATS_PER_PACK)
+    if (threadIdx.x < Nrem)
+    {
+      int offset = loop1Nelem + loop2Nelem + threadIdx.x;
+      float val = 0;
+      if (numSrcs == 0)
+      {
+        val = MEMSET_VAL;
+      }
+      else
+      {
+        for (int i = 0; i < numSrcs; ++i)
+          val += p.src[i][offset];
+      }
+
+      for (int i = 0; i < numDsts; ++i)
+        p.dst[i][offset] = val;
+    }
+  }
+
+  __syncthreads();
+  if (threadIdx.x == 0)
+  {
+    p.startCycle = startCycle;
+    p.stopCycle  = wall_clock64();
+  }
+}
+
+template <typename FLOAT_TYPE, int UNROLL_FACTOR>
+__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
+{
+  int constexpr numFloatsPerPack   = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
+  size_t constexpr loopPackInc     = BLOCKSIZE * UNROLL_FACTOR;
+  size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
+  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
+  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
+  int const numSrcs  = p.numSrcs;
+  int const numDsts  = p.numDsts;
+  size_t const numPacksDone  = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? N : (N / (numFloatsPerPack * numPacksPerWave)) * numPacksPerWave;
+  size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack;
+  size_t loopPackOffset = waveId * numPacksPerWave + threadId;
+
+  while (loopPackOffset < numPacksDone)
+  {
+    FLOAT_TYPE vals[UNROLL_FACTOR];
+
+    if (numSrcs == 0)
+    {
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal<FLOAT_TYPE>();
+    }
+    else
+    {
+      FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset;
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        vals[u] = *(src0Ptr + u * WARP_SIZE);
+
+      for (int i = 1; i < numSrcs; ++i)
+      {
+        FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset;
+
+        #pragma unroll UNROLL_FACTOR
+        for (int u = 0; u < UNROLL_FACTOR; ++u)
+          vals[u] += *(srcPtr + u * WARP_SIZE);
+      }
+    }
+
+    for (int i = 0; i < numDsts; ++i)
+    {
+      FLOAT_TYPE* __restrict__ dstPtr = ((FLOAT_TYPE*)(p.dst[i] + offset)) + loopPackOffset;
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        *(dstPtr + u * WARP_SIZE) = vals[u];
+    }
+    loopPackOffset += loopPackInc;
+  }
+
+  return numFloatsLeft;
+}
+
+template <typename FLOAT_TYPE, int UNROLL_FACTOR>
+__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N)
+{
+  // Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
+  int constexpr numFloatsPerRead          = sizeof(FLOAT_TYPE) / sizeof(float);
+  size_t constexpr numFloatsPerInnerLoop  = BLOCKSIZE * numFloatsPerRead;
+  size_t constexpr numFloatsPerOuterLoop  = numFloatsPerInnerLoop * UNROLL_FACTOR;
+  size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
+  size_t const numFloatsDone = N - numFloatsLeft;
+  int const numSrcs = p.numSrcs;
+  int const numDsts = p.numDsts;
+
+  for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop)
+  {
+    FLOAT_TYPE tmp[UNROLL_FACTOR];
+
+    if (numSrcs == 0)
+    {
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        tmp[u] = MemsetVal<FLOAT_TYPE>();
+    }
+    else
+    {
+      #pragma unroll UNROLL_FACTOR
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+        tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop]));
+
+      for (int i = 1; i < numSrcs; ++i)
+      {
+        #pragma unroll UNROLL_FACTOR
+        for (int u = 0; u < UNROLL_FACTOR; ++u)
+          tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop]));
+      }
+    }
+
+    for (int i = 0; i < numDsts; ++i)
+    {
+      for (int u = 0; u < UNROLL_FACTOR; ++u)
+      {
+        *((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u];
+      }
+    }
+  }
+  return numFloatsLeft;
+}
+
+template <typename FLOAT_TYPE>
+__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
+{
+  switch (unroll)
+  {
+  case 1:  return GpuReduceFuncImpl<FLOAT_TYPE,  1>(p, offset, N);
+  case 2:  return GpuReduceFuncImpl<FLOAT_TYPE,  2>(p, offset, N);
+  case 3:  return GpuReduceFuncImpl<FLOAT_TYPE,  3>(p, offset, N);
+  case 4:  return GpuReduceFuncImpl<FLOAT_TYPE,  4>(p, offset, N);
+  case 5:  return GpuReduceFuncImpl<FLOAT_TYPE,  5>(p, offset, N);
+  case 6:  return GpuReduceFuncImpl<FLOAT_TYPE,  6>(p, offset, N);
+  case 7:  return GpuReduceFuncImpl<FLOAT_TYPE,  7>(p, offset, N);
+  case 8:  return GpuReduceFuncImpl<FLOAT_TYPE,  8>(p, offset, N);
+  case 9:  return GpuReduceFuncImpl<FLOAT_TYPE,  9>(p, offset, N);
+  case 10: return GpuReduceFuncImpl<FLOAT_TYPE, 10>(p, offset, N);
+  case 11: return GpuReduceFuncImpl<FLOAT_TYPE, 11>(p, offset, N);
+  case 12: return GpuReduceFuncImpl<FLOAT_TYPE, 12>(p, offset, N);
+  case 13: return GpuReduceFuncImpl<FLOAT_TYPE, 13>(p, offset, N);
+  case 14: return GpuReduceFuncImpl<FLOAT_TYPE, 14>(p, offset, N);
+  case 15: return GpuReduceFuncImpl<FLOAT_TYPE, 15>(p, offset, N);
+  case 16: return GpuReduceFuncImpl<FLOAT_TYPE, 16>(p, offset, N);
+  default: return GpuReduceFuncImpl<FLOAT_TYPE,  8>(p, offset, N);
+  }
+}
+
+// GPU copy kernel
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel2(SubExecParam* params)
+{
+  int64_t startCycle = wall_clock64();
+  SubExecParam& p = params[blockIdx.x];
+
+  size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
+  if (numFloatsLeft)
+    numFloatsLeft = GpuReduceFunc<float4>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
+
+  if (numFloatsLeft)
+    GpuReduceFunc<float>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
+
+  __threadfence_system();
+  if (threadIdx.x == 0)
+  {
+    p.startCycle = startCycle;
+    p.stopCycle  = wall_clock64();
+  }
+}
+
+#define NUM_GPU_KERNELS 18
+typedef void (*GpuKernelFuncPtr)(SubExecParam*);
+
+GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] =
+{
+  GpuReduceKernel<8>,
+  GpuReduceKernel<1>,
+  GpuReduceKernel<2>,
+  GpuReduceKernel<3>,
+  GpuReduceKernel<4>,
+  GpuReduceKernel<5>,
+  GpuReduceKernel<6>,
+  GpuReduceKernel<7>,
+  GpuReduceKernel<8>,
+  GpuReduceKernel<9>,
+  GpuReduceKernel<10>,
+  GpuReduceKernel<11>,
+  GpuReduceKernel<12>,
+  GpuReduceKernel<13>,
+  GpuReduceKernel<14>,
+  GpuReduceKernel<15>,
+  GpuReduceKernel<16>,
+  GpuReduceKernel2
+};
+
+std::string GpuKernelNames[NUM_GPU_KERNELS] =
+{
+  "Default - 8xUnroll",
+  "Unroll x1",
+  "Unroll x2",
+  "Unroll x3",
+  "Unroll x4",
+  "Unroll x5",
+  "Unroll x6",
+  "Unroll x7",
+  "Unroll x8",
+  "Unroll x9",
+  "Unroll x10",
+  "Unroll x11",
+  "Unroll x12",
+  "Unroll x13",
+  "Unroll x14",
+  "Unroll x15",
+  "Unroll x16",
+  "8xUnrollB",
+};
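The GPU_KERNEL env var (see EnvVars.hpp) indexes GpuKernelTable, so GPU_KERNEL=0 selects the default 8x-unrolled kernel and GPU_KERNEL=17 the two-pass GpuReduceKernel2. A host-side dispatch sketch (illustrative; ev, numSubExecs, stream, and subExecParamGpu are assumed variables, not names introduced by this patch):

```cpp
// Illustrative dispatch under the assumptions stated above.
hipLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],       // kernel selected via GPU_KERNEL
                   dim3(numSubExecs), dim3(BLOCKSIZE), // one threadblock per subExecutor
                   ev.sharedMemBytes, stream,
                   subExecParamGpu);
```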
diff --git a/tfb.so/include/TransferBench.hpp b/tfb.so/include/TransferBench.hpp
new file mode 100644
index 00000000..6abbbc5b
--- /dev/null
+++ b/tfb.so/include/TransferBench.hpp
@@ -0,0 +1,198 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <numa.h>
+#include <set>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+#include "Compatibility.hpp"
+
+// Helper macro for catching HIP errors
+#define HIP_CALL(cmd)                                                     \
+  do {                                                                    \
+    hipError_t error = (cmd);                                             \
+    if (error != hipSuccess)                                              \
+    {                                                                     \
+      std::cerr << "Encountered HIP error (" << hipGetErrorString(error)  \
+                << ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
+      exit(-1);                                                           \
+    }                                                                     \
+  } while (0)
+
+#include "EnvVars.hpp"
+
+// Simple configuration parameters
+size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer
+
+// Different src/dst memory types supported
+typedef enum
+{
+  MEM_CPU          = 0, // Coarse-grained pinned CPU memory
+  MEM_GPU          = 1, // Coarse-grained global GPU memory
+  MEM_CPU_FINE     = 2, // Fine-grained pinned CPU memory
+  MEM_GPU_FINE     = 3, // Fine-grained global GPU memory
+  MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
+  MEM_NULL         = 5, // NULL memory - used for empty
+} MemType;
+
+typedef enum
+{
+  EXE_CPU     = 0, // CPU executor (subExecutor = CPU thread)
+  EXE_GPU_GFX = 1, // GPU kernel-based executor (subExecutor = threadblock/CU)
+  EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams)
+} ExeType;
+
+bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE); }
+bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
+bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
+bool IsCpuType(ExeType e) { return (e == EXE_CPU); };
+
+char const MemTypeStr[7] = "CGBFUN";
+char const ExeTypeStr[4] = "CGD";
+char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
+
+MemType inline CharToMemType(char const c)
+{
+  char const* val = strchr(MemTypeStr, toupper(c));
+  if (val) return (MemType)(val - MemTypeStr);
+  printf("[ERROR] Unexpected memory type (%c)\n", c);
+  exit(1);
+}
+
+ExeType inline CharToExeType(char const c)
+{
+  char const* val = strchr(ExeTypeStr, toupper(c));
+  if (val) return (ExeType)(val - ExeTypeStr);
+  printf("[ERROR] Unexpected executor type (%c)\n", c);
+  exit(1);
+}
+
+// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified)
+// then writes the summation to each of the specified destination memory location(s)
+struct Transfer
+{
+  int     transferIndex;  // Transfer identifier (within a Test)
+  ExeType exeType;        // Transfer executor type
+  int     exeIndex;       // Executor index (NUMA node for CPU / device ID for GPU)
+  int     numSubExecs;    // Number of subExecutors to use for this Transfer
+  size_t  numBytes;       // # of bytes requested to Transfer (may be 0 to fallback to default)
+  size_t  numBytesActual; // Actual number of bytes to copy
+  double  transferTime;   // Time taken in milliseconds
+
+  int numSrcs;                   // Number of sources
+  std::vector<MemType> srcType;  // Source memory types
+  std::vector<int>     srcIndex; // Source device indices
+  std::vector<float*>  srcMem;   // Source memory
+
+  int numDsts;                   // Number of destinations
+  std::vector<MemType> dstType;  // Destination memory types
+  std::vector<int>     dstIndex; // Destination device indices
+  std::vector<float*>  dstMem;   // Destination memory
+
+  std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
+  SubExecParam* subExecParamGpuPtr;       // Pointer to GPU copy of subExecParam
+
+  // Prepares src/dst subarray pointers for each SubExecutor
+  void PrepareSubExecParams(EnvVars const& ev);
+
+  // Prepare source arrays with input data
+  bool PrepareSrc(EnvVars const& ev);
+
+  // Validate that destination data contains expected results
+  void ValidateDst(EnvVars const& ev);
+
+  // Prepare reference buffers
+  void PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx);
+
+  // String representation functions
+  std::string SrcToStr() const;
+  std::string DstToStr() const;
+};
+
+struct ExecutorInfo
+{
+  std::vector<Transfer*> transfers; // Transfers to execute
+  size_t totalBytes;                // Total bytes this executor transfers
+  int    totalSubExecs;             // Total number of subExecutors to use
+
+  // For GPU-Executors
+  SubExecParam* subExecParamGpu;    // GPU copy of subExecutor parameters
+  std::vector<hipStream_t> streams;
+  std::vector<hipEvent_t>  startEvents;
+  std::vector<hipEvent_t>  stopEvents;
+
+  // Results
+  double totalTime;
+};
+
+typedef std::pair<ExeType, int> Executor;
+typedef std::map<Executor, ExecutorInfo> TransferMap;
+
+// Display usage instructions
+void DisplayUsage(char const* cmdName);
+
+// Display detected GPU topology / CPU numa nodes
+void DisplayTopology(bool const outputToCsv);
+
+// Build array of test sizes based on sampling factor
+void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor,
+                       std::vector<size_t>& valuesofN);
+
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
+                  std::vector<MemType>& memType, std::vector<int>& memIndex);
+void ParseExeType(std::string const& token, int const numCpus, int const numGpus,
+                  ExeType& exeType, int& exeIndex);
+
+void ParseTransfers(char* line, int numCpus, int numGpus,
+                    std::vector<Transfer>& transfers);
+
+void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
+                      std::vector<Transfer>& transfers, bool verbose = true);
+
+void EnablePeerAccess(int const deviceId, int const peerDeviceId);
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
+void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
+void CheckPages(char* byteArray, size_t numBytes, int targetId);
+void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
+void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
+void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
+
+// Return the maximum bandwidth measured for given (src/dst) pair
+double GetPeakBandwidth(EnvVars const& ev, size_t const N,
+                        int const isBidirectional,
+                        MemType const srcType, int const srcIndex,
const srcIndex,
+                        MemType const dstType, int const dstIndex);
+
+std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
+
+int RemappedIndex(int const origIdx, bool const isCpuType);
+int GetWallClockRate(int deviceId);
+void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
+std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);
diff --git a/tfb.so/include/action.h b/tfb.so/include/action.h
new file mode 100644
index 00000000..7fdf2b35
--- /dev/null
+++ b/tfb.so/include/action.h
@@ -0,0 +1,86 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef TFB_SO_INCLUDE_ACTION_H_
+#define TFB_SO_INCLUDE_ACTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <pci/pci.h>
+#ifdef __cplusplus
+}
+#endif
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include "include/rvsactionbase.h"
+
+using std::vector;
+using std::string;
+using std::map;
+
+/**
+ * @class tfb_action
+ * @ingroup TFB
+ *
+ * @brief TFB (TransferBench) action implementation class
+ *
+ * Derives from rvs::actionbase and implements actual action functionality
+ * in its run() method.
+ *
+ */
+class tfb_action: public rvs::actionbase {
+ public:
+  tfb_action();
+  virtual ~tfb_action();
+
+  virtual int run(void);
+
+  std::string m_transfer_type;
+
+ protected:
+  //!
TRUE if JSON output is required
+  bool bjson;
+
+  //! reads all tfb-specific configuration keys from
+  //! the module's properties collection
+  bool get_all_tfb_config_keys(void);
+  /**
+   * @brief reads all common configuration keys from
+   * the module's properties collection
+   * @return true if no fatal error occurred, false otherwise
+   */
+  bool get_all_common_config_keys(void);
+
+  /**
+   * @brief gets the number of ROCm compatible AMD GPUs
+   * @return number of GPUs
+   */
+  int get_num_amd_gpu_devices(void);
+  //! creates the worker and starts the TransferBench run
+  bool start_tfb_runners();
+  //! invokes TransferBench for the configured transfer type
+  int run_transferbench(void);
+};
+
+#endif  // TFB_SO_INCLUDE_ACTION_H_
diff --git a/tfb.so/include/rvs_module.h b/tfb.so/include/rvs_module.h
new file mode 100644
index 00000000..9edaec65
--- /dev/null
+++ b/tfb.so/include/rvs_module.h
@@ -0,0 +1,31 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef TFB_SO_INCLUDE_RVS_MODULE_H_
+#define TFB_SO_INCLUDE_RVS_MODULE_H_
+
+#include "include/rvsliblog.h"
+
+
+#endif  // TFB_SO_INCLUDE_RVS_MODULE_H_
diff --git a/tfb.so/include/tfb_worker.h b/tfb.so/include/tfb_worker.h
new file mode 100644
index 00000000..10b1cfac
--- /dev/null
+++ b/tfb.so/include/tfb_worker.h
@@ -0,0 +1,72 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef TFB_SO_INCLUDE_TFB_WORKER_H_
+#define TFB_SO_INCLUDE_TFB_WORKER_H_
+
+#include <string>
+#include <memory>
+#include "include/rvsthreadbase.h"
+#include "include/rvs_blas.h"
+#include "include/rvs_util.h"
+
+#define GST_RESULT_PASS_MESSAGE "true"
+#define GST_RESULT_FAIL_MESSAGE "false"
+
+
+/**
+ * @class tfbWorker
+ * @ingroup TFB
+ *
+ * @brief tfbWorker action implementation class
+ *
+ * Derives from rvs::ThreadBase and implements actual action functionality
+ * in its run() method.
+ *
+ */
+class tfbWorker : public rvs::ThreadBase {
+ public:
+  tfbWorker();
+  virtual ~tfbWorker();
+
+  //! sets action name
+  void set_name(const std::string& name) { action_name = name; }
+  //! returns action name
+  const std::string& get_name(void) { return action_name; }
+
+  //! sets the transfer type to benchmark
+  void set_transfer_type(std::string type) { m_tfr_type = type; }
+  //! returns the transfer type
+  const std::string& get_transfer_type(void) { return m_tfr_type; }
+
+  int TfbRun(int &error, std::string &errdesc);
+
+ protected:
+  virtual void run(void);
+
+ protected:
+  //! name of the action
+  std::string action_name;
+  //! transfer type to execute
+  std::string m_tfr_type;
+};
+
+#endif  // TFB_SO_INCLUDE_TFB_WORKER_H_
diff --git a/tfb.so/src/.gitignore b/tfb.so/src/.gitignore
new file mode 100644
index 00000000..6677c873
--- /dev/null
+++ b/tfb.so/src/.gitignore
@@ -0,0 +1 @@
+/libmain.cpp
diff --git a/tfb.so/src/TransferBench.cpp b/tfb.so/src/TransferBench.cpp
new file mode 100644
index 00000000..51c7f68f
--- /dev/null
+++ b/tfb.so/src/TransferBench.cpp
@@ -0,0 +1,1814 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// This program measures simultaneous copy performance across multiple GPUs
+// on the same node
+#include <cstdlib>
+#include <random>
+#include <stack>
+#include <thread>
+#include <unistd.h>
+
+#include "TransferBench.hpp"
+#include "GetClosestNumaNode.hpp"
+
+
+
+void ExecuteTransfers(EnvVars const& ev,
+                      int const testNum,
+                      size_t const N,
+                      std::vector<Transfer>& transfers,
+                      bool verbose)
+{
+  int const initOffset = ev.byteOffset / sizeof(float);
+
+  // Map transfers by executor
+  TransferMap transferMap;
+  for (Transfer& transfer : transfers)
+  {
+    Executor executor(transfer.exeType, transfer.exeIndex);
+    ExecutorInfo& executorInfo = transferMap[executor];
+    executorInfo.transfers.push_back(&transfer);
+  }
+
+  // Loop over each executor and prepare sub-executors
+  std::map<int, Transfer*> transferList;
+  for (auto& exeInfoPair : transferMap)
+  {
+    Executor const& executor = exeInfoPair.first;
+    ExecutorInfo& exeInfo    = exeInfoPair.second;
+    ExeType const exeType    = executor.first;
+    int     const exeIndex   = RemappedIndex(executor.second, IsCpuType(exeType));
+
+    exeInfo.totalTime = 0.0;
+    exeInfo.totalSubExecs = 0;
+
+    // Loop over each transfer this executor is involved in
+    for (Transfer* transfer : exeInfo.transfers)
+    {
+      // Determine how many bytes to copy for this Transfer (use custom if pre-specified)
+      transfer->numBytesActual = (transfer->numBytes ? transfer->numBytes : N * sizeof(float));
+
+      // Allocate source memory
+      transfer->srcMem.resize(transfer->numSrcs);
+      for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
+      {
+        MemType const& srcType = transfer->srcType[iSrc];
+        int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
+
+        // Ensure executing GPU can access source memory
+        if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
+          EnablePeerAccess(exeIndex, srcIndex);
+
+        AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
+      }
+
+      // Allocate destination memory
+      transfer->dstMem.resize(transfer->numDsts);
+      for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
+      {
+        MemType const& dstType = transfer->dstType[iDst];
+        int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
+
+        // Ensure executing GPU can access destination memory
+        if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
+          EnablePeerAccess(exeIndex, dstIndex);
+
+        AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
+      }
+
+      exeInfo.totalSubExecs += transfer->numSubExecs;
+      transferList[transfer->transferIndex] = transfer;
+    }
+
+    // Prepare additional requirements for GPU-based executors
+    if (IsGpuType(exeType))
+    {
+      HIP_CALL(hipSetDevice(exeIndex));
+
+      // Single-stream mode is only supported for GFX-based executors
+      int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ?
exeInfo.transfers.size() : 1; + exeInfo.streams.resize(numStreamsToUse); + exeInfo.startEvents.resize(numStreamsToUse); + exeInfo.stopEvents.resize(numStreamsToUse); + for (int i = 0; i < numStreamsToUse; ++i) + { + HIP_CALL(hipStreamCreate(&exeInfo.streams[i])); + HIP_CALL(hipEventCreate(&exeInfo.startEvents[i])); + HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i])); + } + + if (exeType == EXE_GPU_GFX) + { + // Allocate one contiguous chunk of GPU memory for threadblock parameters + // This allows support for executing one transfer per stream, or all transfers in a single stream + AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam), + (void**)&exeInfo.subExecParamGpu); + } + } + } + + if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum); + + // Prepare input memory and block parameters for current N + bool isSrcCorrect = true; + for (auto& exeInfoPair : transferMap) + { + Executor const& executor = exeInfoPair.first; + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = executor.first; + int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType)); + + exeInfo.totalBytes = 0; + + int transferOffset = 0; + for (int i = 0; i < exeInfo.transfers.size(); ++i) + { + // Prepare subarrays each threadblock works on and fill src memory with patterned data + Transfer* transfer = exeInfo.transfers[i]; + transfer->PrepareSubExecParams(ev); + isSrcCorrect &= transfer->PrepareSrc(ev); + exeInfo.totalBytes += transfer->numBytesActual; + + // Copy block parameters to GPU for GPU executors + if (transfer->exeType == EXE_GPU_GFX) + { + exeInfo.transfers[i]->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset; + HIP_CALL(hipSetDevice(exeIndex)); + HIP_CALL(hipMemcpy(&exeInfo.subExecParamGpu[transferOffset], + transfer->subExecParam.data(), + transfer->subExecParam.size() * sizeof(SubExecParam), + hipMemcpyHostToDevice)); + HIP_CALL(hipDeviceSynchronize()); + + transferOffset += transfer->subExecParam.size(); + } + } + } + + // Launch kernels (warmup iterations are not counted) + double totalCpuTime = 0; + size_t numTimedIterations = 0; + std::stack threads; + for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++) + { + if (ev.numIterations > 0 && iteration >= ev.numIterations) break; + if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break; + + // Pause before starting first timed iteration in interactive mode + if (verbose && ev.useInteractive && iteration == 0) + { + printf("Memory prepared:\n"); + + for (Transfer& transfer : transfers) + { + printf("Transfer %03d:\n", transfer.transferIndex); + for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc) + printf(" SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]); + for (int iDst = 0; iDst < transfer.numDsts; ++iDst) + printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]); + } + printf("Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Start CPU timing for this iteration + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Execute all Transfers in parallel + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType exeType = exeInfoPair.first.first; + int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 
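+      // (in single-stream GFX mode one launch on stream 0 covers all of this
+      //  executor's Transfers, so only a single RunTransfer thread is spawned)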
1 : exeInfo.transfers.size(); + + for (int i = 0; i < numTransfersToRun; ++i) + threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i)); + } + + // Wait for all threads to finish + int const numTransfers = threads.size(); + for (int i = 0; i < numTransfers; i++) + { + threads.top().join(); + threads.pop(); + } + + // Stop CPU timing for this iteration + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); + + if (iteration >= 0) + { + ++numTimedIterations; + totalCpuTime += deltaSec; + } + } + + // Pause for interactive mode + if (verbose && isSrcCorrect && ev.useInteractive) + { + printf("Transfers complete. Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Validate that each transfer has transferred correctly + size_t totalBytesTransferred = 0; + int const numTransfers = transferList.size(); + for (auto transferPair : transferList) + { + Transfer* transfer = transferPair.second; + transfer->ValidateDst(ev); + totalBytesTransferred += transfer->numBytesActual; + } + + // Report timings + totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000; + double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime; + double maxGpuTime = 0; + + if (!isSrcCorrect) goto cleanup; + if (ev.useSingleStream) + { + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = exeInfoPair.first.second; + + // Compute total time for non GPU executors + if (exeType != EXE_GPU_GFX) + { + exeInfo.totalTime = 0; + for (auto const& transfer : exeInfo.transfers) + exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime); + } + + double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations); + double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, exeDurationMsec); + + if (verbose && !ev.outputToCsv) + { + printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes\n", + ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes); + } + + int totalCUs = 0; + for (auto const& transfer : exeInfo.transfers) + { + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + totalCUs += transfer->numSubExecs; + + if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, + transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + MemTypeStr[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + + if (verbose && ev.outputToCsv) + { + printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, + 
MemTypeStr[exeType], exeIndex, totalCUs, + exeBandwidthGbs, exeDurationMsec); + } + } + } + else + { + for (auto const& transferPair : transferList) + { + Transfer* transfer = transferPair.second; + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, transferDurationMsec); + if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + } + + // Display aggregate statistics + if (verbose) + { + if (!ev.outputToCsv) + { + printf(" Aggregate (CPU) | %7.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n", + totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime); + } + else + { + printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime); + } + } + + // Release GPU memory +cleanup: + for (auto exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType)); + + for (auto& transfer : exeInfo.transfers) + { + for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc) + { + MemType const& srcType = transfer->srcType[iSrc]; + DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset); + } + for (int iDst = 0; iDst < transfer->numDsts; ++iDst) + { + MemType const& dstType = transfer->dstType[iDst]; + DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset); + } + transfer->subExecParam.clear(); + } + + if (IsGpuType(exeType)) + { + int const numStreams = (int)exeInfo.streams.size(); + for (int i = 0; i < numStreams; ++i) + { + HIP_CALL(hipEventDestroy(exeInfo.startEvents[i])); + HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i])); + HIP_CALL(hipStreamDestroy(exeInfo.streams[i])); + } + + if (exeType == EXE_GPU_GFX) + { + DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu); + } + } + } +} + +void DisplayUsage(char const* cmdName) +{ + printf("TransferBench v%s\n", TB_VERSION); + printf("========================================\n"); + + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. 
Check to see if libnuma has been installed on this system\n"); + exit(1); + } + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + int const numCpuDevices = numa_num_configured_nodes(); + + printf("Usage: %s config \n", cmdName); + printf(" config: Either:\n"); + printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n"); + printf(" - Name of preset config:\n"); + printf(" p2p - Peer-to-peer benchmark tests\n"); + printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n"); + printf(" - 3rd/4th optional args for # GPU SubExecs / # CPU SubExecs per Transfer\n"); + printf(" N : (Optional) Number of bytes to copy per Transfer.\n"); + printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", + DEFAULT_BYTES_PER_TRANSFER); + printf(" If 0 is specified, a range of Ns will be benchmarked\n"); + printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n"); + printf("\n"); + + EnvVars::DisplayUsage(); +} + +int RemappedIndex(int const origIdx, bool const isCpuType) +{ + static std::vector remappingCpu; + static std::vector remappingGpu; + + // Build CPU remapping on first use + // Skip numa nodes that are not configured + if (remappingCpu.empty()) + { + for (int node = 0; node <= numa_max_node(); node++) + if (numa_bitmask_isbitset(numa_get_mems_allowed(), node)) + remappingCpu.push_back(node); + } + + // Build remappingGpu on first use + if (remappingGpu.empty()) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + remappingGpu.resize(numGpuDevices); + + int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0; + if (!usePcieIndexing) + { + // For HIP-based indexing no remappingGpu is necessary + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = i; + } + else + { + // Collect PCIe address for each GPU + std::vector> mapping; + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; ++i) + { + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i)); + mapping.push_back(std::make_pair(pciBusId, i)); + } + // Sort GPUs by PCIe address then use that as mapping + std::sort(mapping.begin(), mapping.end()); + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = mapping[i].second; + } + } + return isCpuType ? remappingCpu[origIdx] : remappingGpu[origIdx]; +} + +void DisplayTopology(bool const outputToCsv) +{ + + int numCpuDevices = numa_num_configured_nodes(); + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + + if (outputToCsv) + { + printf("NumCpus,%d\n", numCpuDevices); + printf("NumGpus,%d\n", numGpuDevices); + } + else + { + printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n", + numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices); + } + + // Print out detected CPU topology + if (outputToCsv) + { + printf("NUMA"); + for (int j = 0; j < numCpuDevices; j++) + printf(",NUMA%02d", j); + printf(",# CPUs,ClosestGPUs,ActualNode\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numCpuDevices; j++) + printf("NUMA %02d|", j); + printf(" #Cpus | Closest GPU(s)\n"); + + printf("------------+"); + for (int j = 0; j <= numCpuDevices; j++) + printf("-------+"); + printf("---------------\n"); + } + + for (int i = 0; i < numCpuDevices; i++) + { + int nodeI = RemappedIndex(i, true); + printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? 
"," : "|"); + for (int j = 0; j < numCpuDevices; j++) + { + int nodeJ = RemappedIndex(j, true); + int numaDist = numa_distance(nodeI, nodeJ); + if (outputToCsv) + printf("%d,", numaDist); + else + printf(" %5d |", numaDist); + } + + int numCpus = 0; + for (int j = 0; j < numa_num_configured_cpus(); j++) + if (numa_node_of_cpu(j) == nodeI) numCpus++; + if (outputToCsv) + printf("%d,", numCpus); + else + printf(" %5d | ", numCpus); + +#if !defined(__NVCC__) + bool isFirst = true; + for (int j = 0; j < numGpuDevices; j++) + { + if (GetClosestNumaNode(RemappedIndex(j, false)) == i) + { + if (isFirst) isFirst = false; + else printf(","); + printf("%d", j); + } + } +#endif + printf("\n"); + } + printf("\n"); + +#if defined(__NVCC__) + // No further topology detection done for NVIDIA platforms + return; +#endif + + // Print out detected GPU topology + if (outputToCsv) + { + printf("GPU"); + for (int j = 0; j < numGpuDevices; j++) + printf(",GPU %02d", j); + printf(",PCIe Bus ID,ClosestNUMA\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + { + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, j)); + std::string fullName = prop.gcnArchName; + std::string archName = fullName.substr(0, fullName.find(':')); + printf(" %6s |", archName.c_str()); + } + printf("\n"); + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + printf(" GPU %02d |", j); + printf(" PCIe Bus ID | #CUs | Closest NUMA\n"); + for (int j = 0; j <= numGpuDevices; j++) + printf("--------+"); + printf("--------------+------+-------------\n"); + } + +#if !defined(__NVCC__) + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; i++) + { + int const deviceIdx = RemappedIndex(i, false); + printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |"); + for (int j = 0; j < numGpuDevices; j++) + { + if (i == j) + { + if (outputToCsv) + printf("-,"); + else + printf(" - |"); + } + else + { + uint32_t linkType, hopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx, + RemappedIndex(j, false), + &linkType, &hopCount)); + printf("%s%s-%d%s", + outputToCsv ? "" : " ", + linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" : + linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" : + linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" : + linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" : + linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????", + hopCount, outputToCsv ? 
"," : " |"); + } + } + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx)); + + int numDeviceCUs = 0; + HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx)); + + if (outputToCsv) + printf("%s,%d,%d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + else + printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + } +#endif +} + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, + std::vector& memTypes, std::vector& memIndices) +{ + char typeChar; + int offset = 0, devIndex, inc; + bool found = false; + + memTypes.clear(); + memIndices.clear(); + while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2) + { + offset += inc; + MemType memType = CharToMemType(typeChar); + + if (IsCpuType(memType) && (devIndex < 0 || devIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, devIndex); + exit(1); + } + if (IsGpuType(memType) && (devIndex < 0 || devIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, devIndex); + exit(1); + } + + found = true; + if (memType != MEM_NULL) + { + memTypes.push_back(memType); + memIndices.push_back(devIndex); + } + } + if (!found) + { + printf("[ERROR] Unable to parse memory type token %s. Expected one of %s followed by an index\n", + token.c_str(), MemTypeStr); + exit(1); + } +} + +void ParseExeType(std::string const& token, int const numCpus, int const numGpus, + ExeType &exeType, int& exeIndex) +{ + char typeChar; + if (sscanf(token.c_str(), " %c%d", &typeChar, &exeIndex) != 2) + { + printf("[ERROR] Unable to parse valid executor token (%s). Exepected one of %s followed by an index\n", + token.c_str(), ExeTypeStr); + exit(1); + } + exeType = CharToExeType(typeChar); + + if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, exeIndex); + exit(1); + } + if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, exeIndex); + exit(1); + } +} + +// Helper function to parse a list of Transfer definitions +void ParseTransfers(char* line, int numCpus, int numGpus, std::vector& transfers) +{ + // Replace any round brackets or '->' with spaces, + for (int i = 1; line[i]; i++) + if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; + + transfers.clear(); + + int numTransfers = 0; + std::istringstream iss(line); + iss >> numTransfers; + if (iss.fail()) return; + + std::string exeMem; + std::string srcMem; + std::string dstMem; + + // If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes) + // otherwise read triples (srcMem, exeMem, dstMem) + bool const advancedMode = (numTransfers < 0); + numTransfers = abs(numTransfers); + + int numSubExecs; + if (!advancedMode) + { + iss >> numSubExecs; + if (numSubExecs <= 0 || iss.fail()) + { + printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numSubExecs); + exit(1); + } + } + + size_t numBytes = 0; + for (int i = 0; i < numTransfers; i++) + { + Transfer transfer; + transfer.transferIndex = i; + transfer.numBytes = 0; + transfer.numBytesActual = 0; + if (!advancedMode) + { + iss >> srcMem >> exeMem >> dstMem; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1); + 
exit(1); + } + } + else + { + std::string numBytesToken; + iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1); + exit(1); + } + if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1) + { + printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1); + exit(1); + } + char units = numBytesToken.back(); + switch (toupper(units)) + { + case 'K': numBytes *= 1024; break; + case 'M': numBytes *= 1024*1024; break; + case 'G': numBytes *= 1024*1024*1024; break; + } + } + + ParseMemType(srcMem, numCpus, numGpus, transfer.srcType, transfer.srcIndex); + ParseMemType(dstMem, numCpus, numGpus, transfer.dstType, transfer.dstIndex); + ParseExeType(exeMem, numCpus, numGpus, transfer.exeType, transfer.exeIndex); + + transfer.numSrcs = (int)transfer.srcType.size(); + transfer.numDsts = (int)transfer.dstType.size(); + if (transfer.numSrcs == 0 && transfer.numDsts == 0) + { + printf("[ERROR] Transfer must have at least one src or dst\n"); + exit(1); + } + + if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1)) + { + printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n"); + exit(1); + } + + transfer.numSubExecs = numSubExecs; + transfer.numBytes = numBytes; + transfers.push_back(transfer); + } +} + +void EnablePeerAccess(int const deviceId, int const peerDeviceId) +{ + int canAccess; + HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId)); + if (!canAccess) + { + printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId); + exit(1); + } + HIP_CALL(hipSetDevice(deviceId)); + hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0); + if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) + { + printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n", + deviceId, peerDeviceId, hipGetErrorString(error)); + exit(1); + } +} + +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr) +{ + if (numBytes == 0) + { + printf("[ERROR] Unable to allocate 0 bytes\n"); + exit(1); + } + *memPtr = nullptr; + if (IsCpuType(memType)) + { + // Set numa policy prior to call to hipHostMalloc + numa_set_preferred(devIndex); + + // Allocate host-pinned memory (should respect NUMA mem policy) + if (memType == MEM_CPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n"); + exit(1); +#else + HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); +#endif + } + else if (memType == MEM_CPU) + { +#if defined (__NVCC__) + if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess) +#else + if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess) +#endif + { + printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex); + exit(1); + } + } + else if (memType == MEM_CPU_UNPINNED) + { + *memPtr = numa_alloc_onnode(numBytes, devIndex); + } + + // Check that the allocated pages are actually on the correct NUMA node + memset(*memPtr, 0, numBytes); + CheckPages((char*)*memPtr, numBytes, devIndex); + + // Reset to default numa mem policy + numa_set_preferred(-1); + } + else if (IsGpuType(memType)) + { + if (memType == MEM_GPU) + { + // Allocate GPU memory on appropriate device + 
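+      // (hipSetDevice selects the target GPU so the following hipMalloc returns
+      //  coarse-grained device memory on it; the MEM_GPU_FINE branch below uses
+      //  hipExtMallocWithFlags with hipDeviceMallocFinegrained instead)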
HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipMalloc((void**)memPtr, numBytes)); + } + else if (memType == MEM_GPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n"); + exit(1); +#else + HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained)); + +#endif + } + HIP_CALL(hipMemset(*memPtr, 0, numBytes)); + HIP_CALL(hipDeviceSynchronize()); + } + else + { + printf("[ERROR] Unsupported memory type %d\n", memType); + exit(1); + } +} + +void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes) +{ + if (memType == MEM_CPU || memType == MEM_CPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null CPU pointer for %lu bytes. Skipping hipHostFree\n", bytes); + return; + } + HIP_CALL(hipHostFree(memPtr)); + } + else if (memType == MEM_CPU_UNPINNED) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes. Skipping numa_free\n", bytes); + return; + } + numa_free(memPtr, bytes); + } + else if (memType == MEM_GPU || memType == MEM_GPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. Skipping hipFree\n", bytes); + return; + } + HIP_CALL(hipFree(memPtr)); + } +} + +void CheckPages(char* array, size_t numBytes, int targetId) +{ + unsigned long const pageSize = getpagesize(); + unsigned long const numPages = (numBytes + pageSize - 1) / pageSize; + + std::vector pages(numPages); + std::vector status(numPages); + + pages[0] = array; + for (int i = 1; i < numPages; i++) + { + pages[i] = (char*)pages[i-1] + pageSize; + } + + long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0); + if (retCode) + { + printf("[ERROR] Unable to collect page info\n"); + exit(1); + } + + size_t mistakeCount = 0; + for (int i = 0; i < numPages; i++) + { + if (status[i] < 0) + { + printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i); + exit(1); + } + if (status[i] != targetId) mistakeCount++; + } + if (mistakeCount > 0) + { + printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId); + exit(1); + } +} + +void RunTransfer(EnvVars const& ev, int const iteration, + ExecutorInfo& exeInfo, int const transferIdx) +{ + Transfer* transfer = exeInfo.transfers[transferIdx]; + + if (transfer->exeType == EXE_GPU_GFX) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + // Figure out how many threadblocks to use. + // In single stream mode, all the threadblocks for this GPU are launched + // Otherwise, just launch the threadblocks associated with this single Transfer + int const numBlocksToRun = ev.useSingleStream ? 
exeInfo.totalSubExecs : transfer->numSubExecs; +#if defined(__NVCC__) + HIP_CALL(hipEventRecord(startEvent, stream)); + GpuKernelTable[ev.gpuKernel]<<>>(transfer->subExecParamGpuPtr); + HIP_CALL(hipEventRecord(stopEvent, stream)); +#else + hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel], + dim3(numBlocksToRun, 1, 1), + dim3(BLOCKSIZE, 1, 1), + ev.sharedMemBytes, stream, + startEvent, stopEvent, + 0, transfer->subExecParamGpuPtr); +#endif + // Synchronize per iteration, unless in single sync mode, in which case + // synchronize during last warmup / last actual iteration + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + + if (ev.useSingleStream) + { + // Figure out individual timings for Transfers that were all launched together + for (Transfer* currTransfer : exeInfo.transfers) + { + long long minStartCycle = currTransfer->subExecParamGpuPtr[0].startCycle; + long long maxStopCycle = currTransfer->subExecParamGpuPtr[0].stopCycle; + for (int i = 1; i < currTransfer->numSubExecs; i++) + { + minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle); + maxStopCycle = std::max(maxStopCycle, currTransfer->subExecParamGpuPtr[i].stopCycle); + } + int const wallClockRate = GetWallClockRate(exeIndex); + double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); + currTransfer->transferTime += iterationTimeMs; + } + exeInfo.totalTime += gpuDeltaMsec; + } + else + { + transfer->transferTime += gpuDeltaMsec; + } + } + } + else if (transfer->exeType == EXE_GPU_DMA) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + HIP_CALL(hipEventRecord(startEvent, stream)); + if (transfer->numSrcs == 0 && transfer->numDsts == 1) + { + HIP_CALL(hipMemsetAsync(transfer->dstMem[0], + MEMSET_CHAR, transfer->numBytesActual, stream)); + } + else if (transfer->numSrcs == 1 && transfer->numDsts == 1) + { + HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0], + transfer->numBytesActual, hipMemcpyDefault, + stream)); + } + HIP_CALL(hipEventRecord(stopEvent, stream)); + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + transfer->transferTime += gpuDeltaMsec; + } + } + else if (transfer->exeType == EXE_CPU) // CPU execution agent + { + // Force this thread and all child threads onto correct NUMA node + int const exeIndex = RemappedIndex(transfer->exeIndex, true); + if (numa_run_on_node(exeIndex)) + { + printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex); + exit(1); + } + + std::vector childThreads; + + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Launch each subExecutor in child-threads to perform memcopies + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i]))); + + // Wait for child-threads to finish + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads[i].join(); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + + // Record time if not a warmup iteration + if (iteration >= 0) + 
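+      // accumulate per-iteration wall-clock time in ms; the reporting code later
+      // divides transferTime by the number of timed iterations to get an average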
transfer->transferTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); + } +} + +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) +{ + ev.DisplayP2PBenchmarkEnvVars(); + + // Collect the number of available CPUs/GPUs on this machine + int const numCpus = ev.numCpuDevices; + int const numGpus = ev.numGpuDevices; + int const numDevices = numCpus + numGpus; + + // Enable peer to peer for each GPU + for (int i = 0; i < numGpus; i++) + for (int j = 0; j < numGpus; j++) + if (i != j) EnablePeerAccess(i, j); + + // Perform unidirectional / bidirectional + for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) + { + // Print header + if (!ev.outputToCsv) + { + printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni", + ev.useRemoteRead ? "Remote" : "Local", + ev.useRemoteRead ? "Local" : "Remote", + ev.useDmaCopy ? "DMA" : "GFX"); + + if (isBidirectional) + { + printf("%12s", "SRC\\DST"); + } + else + { + if (ev.useRemoteRead) + printf("%12s", "SRC\\EXE+DST"); + else + printf("%12s", "SRC+EXE\\DST"); + } + for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); + for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i); + printf("\n"); + } + + // Loop over all possible src/dst pairs + for (int src = 0; src < numDevices; src++) + { + MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU); + int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus); + + if (!ev.outputToCsv) + printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex); + + for (int dst = 0; dst < numDevices; dst++) + { + MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU); + int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus); + + double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcType, srcIndex, dstType, dstIndex); + if (!ev.outputToCsv) + { + if (bandwidth == 0) + printf("%10s", "N/A"); + else + printf("%10.2f", bandwidth); + } + else + { + printf("%s %02d,%s %02d,%s,%s,%s,%.2f,%lu\n", + srcType == MEM_CPU ? "CPU" : "GPU", srcIndex, + dstType == MEM_CPU ? "CPU" : "GPU", dstIndex, + isBidirectional ? "bidirectional" : "unidirectional", + ev.useRemoteRead ? "Remote" : "Local", + ev.useDmaCopy ? "DMA" : "GFX", + bandwidth, + N * sizeof(float)); + } + fflush(stdout); + } + if (!ev.outputToCsv) printf("\n"); + } + if (!ev.outputToCsv) printf("\n"); + } +} + +double GetPeakBandwidth(EnvVars const& ev, size_t const N, + int const isBidirectional, + MemType const srcType, int const srcIndex, + MemType const dstType, int const dstIndex) +{ + // Skip bidirectional on same device + if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f; + + // Prepare Transfers + std::vector transfers(2); + transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float); + + // SRC -> DST + transfers[0].numSrcs = transfers[0].numDsts = 1; + transfers[0].srcType.push_back(srcType); + transfers[0].dstType.push_back(dstType); + transfers[0].srcIndex.push_back(srcIndex); + transfers[0].dstIndex.push_back(dstIndex); + + // DST -> SRC + transfers[1].numSrcs = transfers[1].numDsts = 1; + transfers[1].srcType.push_back(dstType); + transfers[1].dstType.push_back(srcType); + transfers[1].srcIndex.push_back(dstIndex); + transfers[1].dstIndex.push_back(srcIndex); + + // Either perform (local read + remote write), or (remote read + local write) + ExeType gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX; + transfers[0].exeType = IsGpuType(ev.useRemoteRead ? 
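+  // (with useRemoteRead the executor sits on the destination side, i.e. remote
+  //  read + local write; otherwise it sits on the source side and performs a
+  //  local read + remote write, matching the benchmark header printed above)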
dstType : srcType) ? gpuExeType : EXE_CPU;
+  transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
+  transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
+  transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
+  transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
+  transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
+
+  // Remove (DST->SRC) if not bidirectional
+  transfers.resize(isBidirectional + 1);
+
+  // Abort if executing on NUMA node with no CPUs
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
+      return 0;
+
+#if defined(__NVCC__)
+    // NVIDIA platform cannot access GPU memory directly from CPU executors
+    if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
+      return 0;
+#endif
+  }
+
+  ExecuteTransfers(ev, 0, N, transfers, false);
+
+  // Collect aggregate bandwidth
+  double totalBandwidth = 0;
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    double transferDurationMsec = transfers[i].transferTime / (1.0 * ev.numIterations);
+    double transferBandwidthGbs = (transfers[i].numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
+    totalBandwidth += transferBandwidthGbs;
+  }
+  return totalBandwidth;
+}
+
+void Transfer::PrepareSubExecParams(EnvVars const& ev)
+{
+  // Each subExecutor needs to know src/dst pointers and how many elements to transfer
+  // Figure out the sub-array each subExecutor works on for this Transfer
+  // - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes,
+  //   except the very last one, for alignment reasons
+  size_t const N = this->numBytesActual / sizeof(float);
+  int const initOffset = ev.byteOffset / sizeof(float);
+  int const targetMultiple = ev.blockBytes / sizeof(float);
+
+  // In some cases, there may not be enough data for all subExecutors
+  int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs);
+  this->subExecParam.clear();
+  this->subExecParam.resize(this->numSubExecs);
+
+  size_t assigned = 0;
+  for (int i = 0; i < this->numSubExecs; ++i)
+  {
+    int const subExecLeft = std::max(0, maxSubExecToUse - i);
+    size_t const leftover = N - assigned;
+    size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
+
+    SubExecParam& p = this->subExecParam[i];
+    p.N = subExecLeft ?
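+    // e.g. for N = 1000 floats and targetMultiple = 64 (256-byte blocks):
+    // maxSubExecToUse = min(ceil(1000/64), numSubExecs) = min(16, numSubExecs),
+    // so with 4 subExecutors the split is 256/256/256/232 floats - every chunk
+    // is a multiple of 64 floats except possibly the last one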
std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0; + p.numSrcs = this->numSrcs; + p.numDsts = this->numDsts; + for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) + p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset; + for (int iDst = 0; iDst < this->numDsts; ++iDst) + p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset; + + if (ev.enableDebug) + { + printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n", + this->transferIndex, i, p.N, assigned, assigned + p.N); + } + + p.startCycle = 0; + p.stopCycle = 0; + assigned += p.N; + } + + this->transferTime = 0.0; +} + +void Transfer::PrepareReference(EnvVars const& ev, std::vector& buffer, int bufferIdx) +{ + size_t N = buffer.size(); + if (bufferIdx >= 0) + { + size_t patternLen = ev.fillPattern.size(); + if (patternLen > 0) + { + for (size_t i = 0; i < N; ++i) + buffer[i] = ev.fillPattern[i % patternLen]; + } + else + { + for (size_t i = 0; i < N; ++i) + buffer[i] = PrepSrcValue(bufferIdx, i); + } + } + else // Destination buffer + { + if (this->numSrcs == 0) + { + // Note: 0x75757575 = 13323083.0 + memset(buffer.data(), MEMSET_CHAR, N * sizeof(float)); + } + else + { + PrepareReference(ev, buffer, 0); + + if (this->numSrcs > 1) + { + std::vector temp(N); + for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx) + { + PrepareReference(ev, temp, srcIdx); + for (int i = 0; i < N; ++i) + { + buffer[i] += temp[i]; + } + } + } + } + } +} + +bool Transfer::PrepareSrc(EnvVars const& ev) +{ + if (this->numSrcs == 0) return true; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + std::vector reference(N); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float* srcPtr = this->srcMem[srcIdx] + initOffset; + PrepareReference(ev, reference, srcIdx); + + // Initialize source memory array with reference pattern + if (IsGpuType(this->srcType[srcIdx])) + { + int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + if (ev.usePrepSrcKernel) + PrepSrcDataKernel<<<32, BLOCKSIZE>>>(srcPtr, N, srcIdx); + else + HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + } + else if (IsCpuType(this->srcType[srcIdx])) + { + memcpy(srcPtr, reference.data(), this->numBytesActual); + } + + // Perform check just to make sure that data has been copied properly + float* srcCheckPtr = srcPtr; + std::vector srcCopy(N); + if (IsGpuType(this->srcType[srcIdx])) + { + if (!ev.validateDirect) + { + HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + srcCheckPtr = srcCopy.data(); + } + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != srcCheckPtr[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx); +#if !defined(__NVCC__) + float const val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]); +#endif + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + 
this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + return false; + } + } + } + return true; +} + +void Transfer::ValidateDst(EnvVars const& ev) +{ + if (this->numDsts == 0) return; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + std::vector reference(N); + PrepareReference(ev, reference, -1); + + std::vector hostBuffer(N); + for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx) + { + float* output; + if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect) + { + output = this->dstMem[dstIdx] + initOffset; + } + else + { + int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + output = hostBuffer.data(); + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != output[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float srcVal; + HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault)); +#if !defined(__NVCC__) + float val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal); +#endif + } + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); +#if !defined(__NVCC__) + float dstVal = this->dstMem[dstIdx][initOffset + i]; + printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal); +#else + printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]); +#endif + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + else + break; + } + } + } +} + +std::string Transfer::SrcToStr() const +{ + if (numSrcs == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numSrcs; ++i) + ss << MemTypeStr[srcType[i]] << srcIndex[i]; + return ss.str(); +} + +std::string Transfer::DstToStr() const +{ + if (numDsts == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numDsts; ++i) + ss << MemTypeStr[dstType[i]] << dstIndex[i]; + return ss.str(); +} + +// NOTE: This is a stop-gap solution until HIP provides wallclock values +int GetWallClockRate(int deviceId) +{ + static std::vector wallClockPerDeviceMhz; + + if (wallClockPerDeviceMhz.size() == 0) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + wallClockPerDeviceMhz.resize(numGpuDevices); + + for (int i = 0; i < numGpuDevices; i++) + { +#if defined(__NVCC__) + int value = 1410000; + //HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i)); + //value *= 1000; +#else + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, i)); + int value = 25000; + switch (prop.gcnArch) + { + case 906: case 910: value = 25000; break; + default: + printf("Unrecognized GCN arch %d\n", prop.gcnArch); + } +#endif + wallClockPerDeviceMhz[i] = 
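+      // (despite the "Mhz" suffix these rates are stored in kHz - e.g. 25000 for
+      //  a 25 MHz AMD wallclock - so dividing cycle deltas by them yields ms)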
value; + } + } + return wallClockPerDeviceMhz[deviceId]; +} + +void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom) +{ + ev.DisplaySweepEnvVars(); + + // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets) + std::vector> exeList; + for (auto exe : ev.sweepExe) + { + ExeType const exeType = CharToExeType(exe); + if (IsGpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex) + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + else if (IsCpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex) + { + // Skip NUMA nodes that have no CPUs (e.g. CXL) + if (ev.numCpusPerNuma[exeIndex] == 0) continue; + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + } + } + int numExes = exeList.size(); + + std::vector> srcList; + for (auto src : ev.sweepSrc) + { + MemType const srcType = CharToMemType(src); + int const numDevices = IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex) + srcList.push_back(std::make_pair(srcType, srcIndex)); + } + int numSrcs = srcList.size(); + + + std::vector> dstList; + for (auto dst : ev.sweepDst) + { + MemType const dstType = CharToMemType(dst); + int const numDevices = IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex) + dstList.push_back(std::make_pair(dstType, dstIndex)); + } + int numDsts = dstList.size(); + + // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count) + struct TransferInfo + { + MemType srcType; int srcIndex; + ExeType exeType; int exeIndex; + MemType dstType; int dstIndex; + }; + + // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI + bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0); + + std::vector possibleTransfers; + TransferInfo tinfo; + for (int i = 0; i < numExes; ++i) + { + // Skip CPU executors if XGMI link must be used + if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue; + tinfo.exeType = exeList[i].first; + tinfo.exeIndex = exeList[i].second; + + bool isXgmiSrc = false; + int numHopsSrc = 0; + for (int j = 0; j < numSrcs; ++j) + { + if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first)) + { + if (exeList[i].second != srcList[j].second) + { +#if defined(__NVCC__) + isXgmiSrc = false; +#else + uint32_t exeToSrcLinkType, exeToSrcHopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false), + RemappedIndex(srcList[j].second, false), + &exeToSrcLinkType, + &exeToSrcHopCount)); + isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI); + if (isXgmiSrc) numHopsSrc = exeToSrcHopCount; +#endif + } + else + { + isXgmiSrc = true; + numHopsSrc = 0; + } + + // Skip this SRC if it is not XGMI but only XGMI links may be used + if (useXgmiOnly && !isXgmiSrc) continue; + + // Skip this SRC if XGMI distance is already past limit + if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue; + } + else if (useXgmiOnly) continue; + + tinfo.srcType = srcList[j].first; + tinfo.srcIndex = srcList[j].second; + + bool isXgmiDst = false; + int numHopsDst = 0; + for (int k = 0; k < numDsts; ++k) + { + if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first)) + { + if (exeList[i].second != dstList[k].second) + { +#if defined(__NVCC__) + isXgmiSrc = 
+
+  std::vector<TransferInfo> possibleTransfers;
+  TransferInfo tinfo;
+  for (int i = 0; i < numExes; ++i)
+  {
+    // Skip CPU executors if XGMI link must be used
+    if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue;
+    tinfo.exeType  = exeList[i].first;
+    tinfo.exeIndex = exeList[i].second;
+
+    bool isXgmiSrc = false;
+    int numHopsSrc = 0;
+    for (int j = 0; j < numSrcs; ++j)
+    {
+      if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first))
+      {
+        if (exeList[i].second != srcList[j].second)
+        {
+#if defined(__NVCC__)
+          isXgmiSrc = false;
+#else
+          uint32_t exeToSrcLinkType, exeToSrcHopCount;
+          HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
+                                                RemappedIndex(srcList[j].second, false),
+                                                &exeToSrcLinkType,
+                                                &exeToSrcHopCount));
+          isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
+          if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
+#endif
+        }
+        else
+        {
+          isXgmiSrc = true;
+          numHopsSrc = 0;
+        }
+
+        // Skip this SRC if it is not XGMI but only XGMI links may be used
+        if (useXgmiOnly && !isXgmiSrc) continue;
+
+        // Skip this SRC if XGMI distance is already past limit
+        if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue;
+      }
+      else if (useXgmiOnly) continue;
+
+      tinfo.srcType  = srcList[j].first;
+      tinfo.srcIndex = srcList[j].second;
+
+      bool isXgmiDst = false;
+      int numHopsDst = 0;
+      for (int k = 0; k < numDsts; ++k)
+      {
+        if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first))
+        {
+          if (exeList[i].second != dstList[k].second)
+          {
+#if defined(__NVCC__)
+            isXgmiDst = false;
+#else
+            uint32_t exeToDstLinkType, exeToDstHopCount;
+            HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
+                                                  RemappedIndex(dstList[k].second, false),
+                                                  &exeToDstLinkType,
+                                                  &exeToDstHopCount));
+            isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
+            if (isXgmiDst) numHopsDst = exeToDstHopCount;
+#endif
+          }
+          else
+          {
+            isXgmiDst = true;
+            numHopsDst = 0;
+          }
+        }
+
+        // Skip this DST if it is not XGMI but only XGMI links may be used
+        if (useXgmiOnly && !isXgmiDst) continue;
+
+        // Skip this DST if total XGMI distance (SRC + DST) is less than min limit
+        if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue;
+
+        // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
+        if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
+
+#if defined(__NVCC__)
+        // Skip CPU executors on GPU memory on NVIDIA platform
+        if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
+          continue;
+#endif
+
+        tinfo.dstType  = dstList[k].first;
+        tinfo.dstIndex = dstList[k].second;
+
+        possibleTransfers.push_back(tinfo);
+      }
+    }
+  }
+
+  int const numPossible = (int)possibleTransfers.size();
+  int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
+
+  if (ev.sweepMin > numPossible)
+  {
+    printf("No valid test configurations exist\n");
+    return;
+  }
+
+  if (ev.outputToCsv)
+  {
+    printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
+           "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
+  }
+
+  int numTestsRun = 0;
+  int M = ev.sweepMin;
+  std::uniform_int_distribution<size_t> randSize(1, numBytesPerTransfer / sizeof(float));
+  std::uniform_int_distribution<int> distribution(ev.sweepMin, maxParallelTransfers);
+
+  // Log sweep to configuration file
+  FILE *fp = fopen("lastSweep.cfg", "w");
+  if (!fp)
+  {
+    printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
+    exit(1);
+  }
+
+  // Create bitmask of numPossible triplets, of which M will be chosen
+  std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
+  auto cpuStart = std::chrono::high_resolution_clock::now();
+  while (1)
+  {
+    if (isRandom)
+    {
+      // Pick a random number of simultaneous transfers to execute
+      // NOTE: This currently skews the distribution because some counts have more combinations than others
+      M = distribution(*ev.generator);
+
+      // Generate a random bitmask
+      for (int i = 0; i < numPossible; i++)
+        bitmask[i] = (i < M) ? 1 : 0;
+      std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator);
+    }
+
+    // Convert bitmask to list of Transfers
+    std::vector<Transfer> transfers;
+    for (int value = 0; value < numPossible; ++value)
+    {
+      if (bitmask[value])
+      {
+        // Convert integer value to (SRC->EXE->DST) triplet
+        Transfer transfer;
+        transfer.numSrcs       = 1;
+        transfer.numDsts       = 1;
+        transfer.srcType       = {possibleTransfers[value].srcType};
+        transfer.srcIndex      = {possibleTransfers[value].srcIndex};
+        transfer.exeType       = possibleTransfers[value].exeType;
+        transfer.exeIndex      = possibleTransfers[value].exeIndex;
+        transfer.dstType       = {possibleTransfers[value].dstType};
+        transfer.dstIndex      = {possibleTransfers[value].dstIndex};
+        transfer.numSubExecs   = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs;
+        transfer.transferIndex = transfers.size();
+        transfer.numBytes      = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0;
+        transfers.push_back(transfer);
+      }
+    }
+
+    LogTransfers(fp, ++numTestsRun, transfers);
+    ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers);
+
+    // Check for test limit
+    if (numTestsRun == ev.sweepTestLimit)
+    {
+      printf("Test limit reached\n");
+      break;
+    }
+
+    // Check for time limit
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+    if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
+    {
+      printf("Time limit exceeded\n");
+      break;
+    }
+
+    // Increment bitmask if not random sweep
+    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
+    {
+      M++;
+      // Check for completion
+      if (M > maxParallelTransfers)
+      {
+        printf("Sweep complete\n");
+        break;
+      }
+      for (int i = 0; i < numPossible; i++)
+        bitmask[i] = (i < M) ? 1 : 0;
+    }
+  }
+  fclose(fp);
+}
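The non-random sweep enumerates every size-M combination of the possible triplets by calling std::prev_permutation on a bitmask that starts as M ones followed by zeros. A minimal self-contained illustration of that technique (with hypothetical N and M, not values from the patch):

// Hedged sketch (not part of the patch): enumerating all M-of-N combinations the
// same way the sweep loop does. std::prev_permutation returns false once the mask
// reaches its lexicographically smallest arrangement, which signals completion.
#include <algorithm>
#include <cstdio>
#include <string>

int main()
{
  int const N = 4, M = 2;
  std::string bitmask(M, 1); bitmask.resize(N, 0);  // e.g. {1,1,0,0}
  do {
    for (int i = 0; i < N; i++) putchar(bitmask[i] ? '1' : '0');
    putchar('\n');                                  // prints all C(4,2) = 6 masks
  } while (std::prev_permutation(bitmask.begin(), bitmask.end()));
  return 0;
}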
+
+void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
+{
+  fprintf(fp, "# Test %d\n", testNum);
+  fprintf(fp, "%d", -1 * (int)transfers.size());
+  for (auto const& transfer : transfers)
+  {
+    fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)",
+            MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0],
+            ExeTypeStr[transfer.exeType], transfer.exeIndex,
+            MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0],
+            transfer.numSubExecs,
+            transfer.numBytes);
+  }
+  fprintf(fp, "\n");
+  fflush(fp);
+}
+
+std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset)
+{
+  std::stringstream ss;
+  for (size_t i = 0; i < strVector.size(); ++i)
+  {
+    if (i) ss << " ";
+    ss << (strVector[i] + initOffset);
+  }
+  return ss.str();
+}
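For reference, each LogTransfers call appends one test record to lastSweep.cfg in the shape below. The concrete values are hypothetical, shown only to illustrate the format string (negative transfer count, then one src->exe->dst triplet with its sub-executor count and byte size per transfer):

# Test 1
-2 (G0->G1->G1 4 67108864) (C0->G2->G2 4 67108864)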
diff --git a/tfb.so/src/action.cpp b/tfb.so/src/action.cpp
new file mode 100644
index 00000000..0d0d4931
--- /dev/null
+++ b/tfb.so/src/action.cpp
@@ -0,0 +1,204 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/action.h"
+
+#include <string>
+#include <vector>
+#include <map>
+#include <regex>
+
+#define __HIP_PLATFORM_HCC__
+
+#include "include/rvs_key_def.h"
+#include "include/tfb_worker.h"
+#include "include/rvsactionbase.h"
+#include "include/rvsloglp.h"
+
+using std::string;
+using std::vector;
+using std::map;
+using std::regex;
+
+#define RVS_CONF_TFB_TYPE_KEY "transfer-type"
+#define MODULE_NAME "transfer-bench"
+#define MODULE_NAME_CAPS "TRANSFER-BENCH"
+#define TFB_DEFAULT_TYPE "p2p"
+#define TFB_NO_COMPATIBLE_GPUS "No AMD compatible GPU found!"
+
+#define FLOATING_POINT_REGEX "^[0-9]*\\.?[0-9]+$"
+
+#define JSON_CREATE_NODE_ERROR "JSON cannot create node"
+
+/**
+ * @brief default class constructor
+ */
+tfb_action::tfb_action() {
+  bjson = false;
+}
+
+/**
+ * @brief class destructor
+ */
+tfb_action::~tfb_action() {
+  property.clear();
+}
+
+/**
+ * @brief runs the TransferBench test session
+ * @return true if no error occurred, false otherwise
+ */
+bool tfb_action::start_tfb_runners() {
+  // one worker is sufficient as the test runner
+  tfbWorker worker;
+  worker.set_name(action_name);
+  worker.set_transfer_type(m_transfer_type);
+  worker.start();
+  worker.join();
+
+  return rvs::lp::Stopping() ? false : true;
+}
+
+/**
+ * @brief reads all TFB-related configuration keys from
+ * the module's properties collection
+ * @return true if no fatal error occurred, false otherwise
+ */
+bool tfb_action::get_all_tfb_config_keys(void) {
+  string msg;
+  bool bsts = true;
+
+  if (property_get(RVS_CONF_TFB_TYPE_KEY, &m_transfer_type,
+                   TFB_DEFAULT_TYPE)) {
+    msg = "invalid '" +
+        std::string(RVS_CONF_TFB_TYPE_KEY) + "' key value";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    bsts = false;
+  }
+
+  return bsts;
+}
+
+/**
+ * @brief reads all common configuration keys from
+ * the module's properties collection
+ * @return true if no fatal error occurred, false otherwise
+ */
+bool tfb_action::get_all_common_config_keys(void) {
+  bool bsts = true;
+
+  // placeholder for later
+
+  return bsts;
+}
+
+/**
+ * @brief gets the number of ROCm compatible AMD GPUs
+ * @return number of GPUs found, 0 if none are compatible
+ */
+int tfb_action::get_num_amd_gpu_devices(void) {
+  int hip_num_gpu_devices;
+  string msg;
+
+  hipGetDeviceCount(&hip_num_gpu_devices);
+  if (hip_num_gpu_devices == 0) {  // no AMD compatible GPU
+    msg = action_name + " " + MODULE_NAME + " " + TFB_NO_COMPATIBLE_GPUS;
+    rvs::lp::Log(msg, rvs::logerror);
+
+    if (bjson) {
+      unsigned int sec;
+      unsigned int usec;
+      rvs::lp::get_ticks(&sec, &usec);
+      void *json_root_node = rvs::lp::LogRecordCreate(MODULE_NAME,
+          action_name.c_str(), rvs::loginfo, sec, usec, true);
+      if (!json_root_node) {
+        // log the error
+        string msg = std::string(JSON_CREATE_NODE_ERROR);
+        rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+        return -1;
+      }
+
+      rvs::lp::AddString(json_root_node, "ERROR", TFB_NO_COMPATIBLE_GPUS);
+      rvs::lp::LogRecordFlush(json_root_node, rvs::loginfo);
+    }
+    return 0;
+  }
+  return hip_num_gpu_devices;
+}
+
+/**
+ * @brief gets all selected GPUs and starts the worker threads
+ * @return run result
+ */
+int tfb_action::run_transferbench(void) {
+  int hip_num_gpu_devices;
+  std::string msg;
+
+  hip_num_gpu_devices = get_num_amd_gpu_devices();
+
+  if (hip_num_gpu_devices > 0) {
+    if (start_tfb_runners())
+      return 0;
+
+    return -1;
+  } else {
+    msg = "No devices match criteria from the test configuration.";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    return -1;
+  }
+}
+
+/**
+ * @brief runs the whole TFB logic
+ * @return run result
+ */
+int tfb_action::run(void) {
+  string msg;
+
+  // get the action name
+  if (property_get(RVS_CONF_NAME_KEY, &action_name)) {
+    rvs::lp::Err("Action name missing", MODULE_NAME_CAPS);
+    return -1;
+  }
+
+  // check for -j flag (json logging)
+  if (property.find("cli.-j") != property.end())
+    bjson = true;
+  if (!get_all_common_config_keys())
+    return -1;
+  if (!get_all_tfb_config_keys())
+    return -1;
+
+  return run_transferbench();
+}
diff --git a/tfb.so/src/hiptest_worker.cpp b/tfb.so/src/hiptest_worker.cpp
new file mode 100644
index 00000000..9a31fc32
--- /dev/null
+++ b/tfb.so/src/hiptest_worker.cpp
@@ -0,0 +1,96 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/hiptest_worker.h"
+
+#include <string>
+
+#include "include/rvs_blas.h"
+#include "include/rvs_module.h"
+#include "include/rvsloglp.h"
+#include "include/rvs_util.h"
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define MODULE_NAME "hiptest"
+
+using std::string;
+
+//bool hipTestWorker::bjson = false;
+
+hipTestWorker::hipTestWorker() {}
+hipTestWorker::~hipTestWorker() {}
+
+/**
+ * @brief performs the stress test on the given GPU
+ */
+void hipTestWorker::run() {
+  string msg, err_description;
+  int error = 0;
+
+  // log the hip test start message
+  msg = "[" + action_name + "] " + MODULE_NAME + " " +
+        " Starting the Hip test ";
+  rvs::lp::Log(msg, rvs::logtrace);
+
+  // run the test and check the result
+  bool hipsuccess = start_hip_tests(error, err_description);
+
+  // the test process failed or could not be run
+  if (!hipsuccess || error) {
+    string msg = "[" + action_name + "] " + MODULE_NAME + " " +
+                 err_description;
+    rvs::lp::Log(msg, rvs::logerror);
+    return;
+  }
+}
+
+/**
+ * @brief forks and execs the test binary, then collects its exit status
+ * @return true if the test succeeded, false otherwise
+ */
+bool hipTestWorker::start_hip_tests(int &error, string &errdesc) {
+  pid_t pid;
+  int status;
+  auto found = m_test_path.find_last_of('/');
+  auto fname = m_test_path.substr(found + 1);
+  if ((pid = fork()) == 0) {  // child: replace image with the test binary
+    execl(m_test_path.c_str(), fname.c_str(), m_test_args.c_str(),
+          static_cast<char *>(nullptr));  // exec arg list must end with a null pointer
+    _exit(127);  // only reached if execl() fails
+  } else {  // parent: wait for the child and inspect its exit status
+    waitpid(pid, &status, 0);
+    if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+      error = 0;
+      return true;
+    }
+    error = 1;
+    errdesc = "hip test process failed";
+    return false;
+  }
+}
diff --git a/tfb.so/src/rvs_module.cpp b/tfb.so/src/rvs_module.cpp
new file mode 100644
index 00000000..f96a16f1
--- /dev/null
+++ b/tfb.so/src/rvs_module.cpp
@@ -0,0 +1,91 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/rvs_module.h"
+#include "include/action.h"
+#include "include/rvsloglp.h"
+#include "include/gpu_util.h"
+
+/**
+ * @defgroup TFB TFB Module
+ *
+ * @brief runs TransferBench data-transfer benchmarks
+ *
+ * The TFB module wraps TransferBench, which measures simultaneous copy
+ * performance across the CPUs and GPUs of a single node.
+ * Transfers can target one, some, or all GPUs, and the transfer type to
+ * run is selected through the 'transfer-type' configuration key.
+ *
+ */
+
+extern "C" int rvs_module_has_interface(int iid) {
+  int sts = 0;
+  switch (iid) {
+  case 0:
+  case 1:
+    sts = 1;
+  }
+  return sts;
+}
+
+extern "C" const char* rvs_module_get_description(void) {
+  return "ROCm Validation Suite TransferBench module";
+}
+
+extern "C" const char* rvs_module_get_config(void) {
+  return "transfer-type (string)";
+}
+
+extern "C" const char* rvs_module_get_output(void) {
+  return "pass (bool)";
+}
+
+extern "C" int rvs_module_init(void* pMi) {
+  rvs::lp::Initialize(pMi);
+  rvs::gpulist::Initialize();
+  return 0;
+}
+
+extern "C" int rvs_module_terminate(void) {
+  return 0;
+}
+
+extern "C" void* rvs_module_action_create(void) {
+  return static_cast<void*>(new tfb_action);
+}
+
+extern "C" int rvs_module_action_destroy(void* pAction) {
+  delete static_cast<tfb_action*>(pAction);
+  return 0;
+}
+
+extern "C" int rvs_module_action_property_set(void* pAction, const char* Key,
+                                              const char* Val) {
+  return static_cast<tfb_action*>(pAction)->property_set(Key, Val);
+}
+
+extern "C" int rvs_module_action_run(void* pAction) {
+  return static_cast<tfb_action*>(pAction)->run();
+}
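For orientation, the RVS launcher loads each module shared object and resolves these extern "C" entry points by name. A minimal sketch of that consumer side follows; the library path and the subset of symbols resolved are illustrative, not part of this patch.

// Hedged sketch (not part of the patch): resolving the entry points exported above.
#include <dlfcn.h>
#include <cstdio>

int main()
{
  void* so = dlopen("libtfb.so", RTLD_NOW);           // hypothetical path
  if (!so) { fprintf(stderr, "%s\n", dlerror()); return 1; }

  using desc_fn = const char* (*)(void);
  auto get_desc = reinterpret_cast<desc_fn>(dlsym(so, "rvs_module_get_description"));
  if (get_desc) printf("%s\n", get_desc());           // module self-description

  dlclose(so);
  return 0;
}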
diff --git a/tfb.so/src/tfb_worker.cpp b/tfb.so/src/tfb_worker.cpp
new file mode 100644
index 00000000..5ec11185
--- /dev/null
+++ b/tfb.so/src/tfb_worker.cpp
@@ -0,0 +1,104 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// This program measures simultaneous copy performance across multiple GPUs
+// on the same node
+#include <numa.h>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "include/tfb_worker.h"
+#include "include/rvsloglp.h"
+#include "include/TransferBench.hpp"
+#include "include/GetClosestNumaNode.hpp"
+
+static const std::string MODULE_NAME{"transfer-bench"};
+
+/**
+ * @brief runs the TransferBench test for this action
+ */
+void tfbWorker::run() {
+  std::string msg, err_description;
+  int error = 0;
+
+  // log the test start message
+  msg = "[" + action_name + "] " + MODULE_NAME + " " +
+        " Starting the TransferBench test ";
+  rvs::lp::Log(msg, rvs::logtrace);
+
+  // run the benchmark and check the result
+  TfbRun(error, err_description);
+
+  // the GPU was not able to do the processing (HIP error(s) occurred)
+  if (error) {
+    std::string msg = "[" + action_name + "] " + MODULE_NAME + " " +
+                      err_description;
+    rvs::lp::Log(msg, rvs::logerror);
+    return;
+  }
+}
+
+int tfbWorker::TfbRun(int &err, std::string &errmsg)
+{
+  // Check for NUMA library support
+  if (numa_available() == -1)
+  {
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
+  }
+
+  // Collect environment variables / display current run configuration
+  EnvVars ev;
+
+  // Determine number of bytes to run per Transfer
+  //size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
+  size_t numBytesPerTransfer = DEFAULT_BYTES_PER_TRANSFER * 1024 * 1024;
+  /*
+  if (argc > 2)
+  {
+    // Adjust bytes if unit specified
+    char units = argv[2][strlen(argv[2])-1];
+    switch (units)
+    {
+    case 'K': case 'k': numBytesPerTransfer *= 1024; break;
+    case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
+    case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
+    }
+  }*/
+  if (numBytesPerTransfer % 4)
+  {
+    printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
+    exit(1);
+  }
+
+  ev.configMode = CFG_P2P;
+  RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float));
+
+  return 0;
+}
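The commented-out block above carries over TransferBench's command-line handling, where a trailing K/M/G suffix scales the requested byte count. A small standalone helper restating that logic (the function name is hypothetical, not part of the patch):

// Hedged sketch (not part of the patch): the K/M/G suffix scaling the
// commented-out block above would apply to a size argument such as "64M".
#include <cstdlib>
#include <cstring>

size_t ParseBytesWithSuffix(const char* arg)
{
  size_t bytes = atoll(arg);              // parses the leading digits only
  switch (arg[strlen(arg) - 1])           // optional trailing unit character
  {
  case 'K': case 'k': bytes *= 1024; break;
  case 'M': case 'm': bytes *= 1024 * 1024; break;
  case 'G': case 'g': bytes *= 1024ULL * 1024 * 1024; break;
  }
  return bytes;                           // e.g. "64M" -> 67108864
}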
diff --git a/tfb.so/tests.cmake b/tfb.so/tests.cmake
new file mode 100644
index 00000000..c1b6b74e
--- /dev/null
+++ b/tfb.so/tests.cmake
@@ -0,0 +1,27 @@
+################################################################################
+##
+## Copyright (c) 2018 ROCm Developer Tools
+##
+## MIT LICENSE:
+## Permission is hereby granted, free of charge, to any person obtaining a copy of
+## this software and associated documentation files (the "Software"), to deal in
+## the Software without restriction, including without limitation the rights to
+## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+## of the Software, and to permit persons to whom the Software is furnished to do
+## so, subject to the following conditions:
+##
+## The above copyright notice and this permission notice shall be included in all
+## copies or substantial portions of the Software.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+## SOFTWARE.
+##
+################################################################################
+
+include(tests_conf_logging)
diff --git a/tfb.so/transferbench/CMakeLists.txt b/tfb.so/transferbench/CMakeLists.txt
new file mode 100644
index 00000000..18df727b
--- /dev/null
+++ b/tfb.so/transferbench/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+cmake_minimum_required(VERSION 3.5)
+
+if (DEFINED ENV{ROCM_PATH})
+  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
+else()
+  set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory")
+endif()
+
+project(TransferBench VERSION 1.0 LANGUAGES CXX)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
+include_directories(${ROCM_PATH}/include)
+link_libraries(numa hsa-runtime64 pthread)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ..)
+add_executable(TransferBench src/TransferBench.cpp)
+target_include_directories(TransferBench PRIVATE src/include)
+
+find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
+include(ROCMInstallTargets)
+include(ROCMCreatePackage)
+set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF)
+
+set(PACKAGE_NAME TB)
+set(LIBRARY_NAME TransferBench)
+
+rocm_install(TARGETS TransferBench)
+
+rocm_create_package(
+  NAME ${LIBRARY_NAME}
+  DESCRIPTION "TransferBench package"
+  MAINTAINER "RCCL Team "
+)