From 42da22362c3303f9bef209e60108e5a643a1e5e0 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Wed, 2 Aug 2023 06:53:03 +0000 Subject: [PATCH 1/7] transfer bench code integration to rvs --- CMakeLists.txt | 2 +- transferbench.so/CMakeLists.txt | 182 ++ transferbench.so/include/Compatibility.hpp | 93 + transferbench.so/include/EnvVars.hpp | 510 ++++ .../include/GetClosestNumaNode.hpp | 149 + transferbench.so/include/Kernels.hpp | 409 +++ transferbench.so/include/TransferBench.hpp | 198 ++ transferbench.so/include/action.h | 86 + transferbench.so/include/rvs_module.h | 31 + transferbench.so/include/tfb_worker.h | 74 + transferbench.so/src/.gitignore | 1 + transferbench.so/src/action.cpp | 215 ++ transferbench.so/src/hiptest_worker.cpp | 96 + transferbench.so/src/rvs_module.cpp | 91 + transferbench.so/src/tfb_worker.cpp | 1953 +++++++++++++ transferbench.so/tests.cmake | 27 + transferbench.so/transferbench/CHANGELOG.md | 199 ++ transferbench.so/transferbench/CMakeLists.txt | 31 + transferbench.so/transferbench/LICENSE.md | 19 + transferbench.so/transferbench/README.md | 63 + .../transferbench/docs/.doxygen/Doxyfile | 2455 +++++++++++++++++ .../transferbench/docs/.sphinx/_toc.yml.in | 13 + .../docs/.sphinx/requirements.in | 1 + .../docs/.sphinx/requirements.txt | 155 ++ transferbench.so/transferbench/docs/api.rst | 5 + transferbench.so/transferbench/docs/conf.py | 14 + .../docs/examples/configfile_format.rst | 93 + .../transferbench/docs/examples/index.rst | 11 + transferbench.so/transferbench/docs/index.rst | 11 + .../transferbench/docs/instructions.rst | 36 + .../transferbench/examples/example.cfg | 79 + 31 files changed, 7301 insertions(+), 1 deletion(-) create mode 100644 transferbench.so/CMakeLists.txt create mode 100644 transferbench.so/include/Compatibility.hpp create mode 100644 transferbench.so/include/EnvVars.hpp create mode 100644 transferbench.so/include/GetClosestNumaNode.hpp create mode 100644 transferbench.so/include/Kernels.hpp create mode 100644 
transferbench.so/include/TransferBench.hpp create mode 100644 transferbench.so/include/action.h create mode 100644 transferbench.so/include/rvs_module.h create mode 100644 transferbench.so/include/tfb_worker.h create mode 100644 transferbench.so/src/.gitignore create mode 100644 transferbench.so/src/action.cpp create mode 100644 transferbench.so/src/hiptest_worker.cpp create mode 100644 transferbench.so/src/rvs_module.cpp create mode 100644 transferbench.so/src/tfb_worker.cpp create mode 100644 transferbench.so/tests.cmake create mode 100644 transferbench.so/transferbench/CHANGELOG.md create mode 100644 transferbench.so/transferbench/CMakeLists.txt create mode 100644 transferbench.so/transferbench/LICENSE.md create mode 100644 transferbench.so/transferbench/README.md create mode 100644 transferbench.so/transferbench/docs/.doxygen/Doxyfile create mode 100644 transferbench.so/transferbench/docs/.sphinx/_toc.yml.in create mode 100644 transferbench.so/transferbench/docs/.sphinx/requirements.in create mode 100644 transferbench.so/transferbench/docs/.sphinx/requirements.txt create mode 100644 transferbench.so/transferbench/docs/api.rst create mode 100644 transferbench.so/transferbench/docs/conf.py create mode 100644 transferbench.so/transferbench/docs/examples/configfile_format.rst create mode 100644 transferbench.so/transferbench/docs/examples/index.rst create mode 100644 transferbench.so/transferbench/docs/index.rst create mode 100644 transferbench.so/transferbench/docs/instructions.rst create mode 100644 transferbench.so/transferbench/examples/example.cfg diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a750b23..e54fd195 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -474,7 +474,7 @@ add_subdirectory(rcqt.so) add_subdirectory(smqt.so) add_subdirectory(mem.so) add_subdirectory(babel.so) -#add_subdirectory(edp.so) +add_subdirectory(transferbench.so) add_subdirectory(perf.so) if (RVS_BUILD_TESTS) diff --git a/transferbench.so/CMakeLists.txt 
b/transferbench.so/CMakeLists.txt new file mode 100644 index 00000000..21165721 --- /dev/null +++ b/transferbench.so/CMakeLists.txt @@ -0,0 +1,182 @@ +################################################################################ +## +## Copyright (c) 2018 ROCm Developer Tools +## +## MIT LICENSE: +## Permission is hereby granted, free of charge, to any person obtaining a copy of +## this software and associated documentation files (the "Software"), to deal in +## the Software without restriction, including without limitation the rights to +## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +## of the Software, and to permit persons to whom the Software is furnished to do +## so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. 
+## +################################################################################ + +cmake_minimum_required ( VERSION 3.5.0 ) +if ( ${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) + message(FATAL "In-source build is not allowed") +endif () +set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + +set ( RVS "tfb" ) +set ( RVS_PACKAGE "rvs-roct" ) +set ( RVS_COMPONENT "lib${RVS}" ) +set ( RVS_TARGET "${RVS}" ) + +project ( ${RVS_TARGET} ) + +message(STATUS "MODULE: ${RVS}") +add_compile_options(-std=c++11) +add_compile_options(-Wall ) +if (RVS_COVERAGE) + add_compile_options(-o0 -fprofile-arcs -ftest-coverage) + set(CMAKE_EXE_LINKER_FLAGS "--coverage") + set(CMAKE_SHARED_LINKER_FLAGS "--coverage") +endif() + +## Set default module path if not already set +if ( NOT DEFINED CMAKE_MODULE_PATH ) + set ( CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake_modules/" ) +endif () + +## Include common cmake modules +include ( utils ) + +## Setup the package version. +get_version ( "0.0.0" ) + +set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) +set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) +set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) + set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) +endif () +set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +## make version numbers visible to C code +add_compile_options(-DBUILD_VERSION_MAJOR=${VERSION_MAJOR}) +add_compile_options(-DBUILD_VERSION_MINOR=${VERSION_MINOR}) +add_compile_options(-DBUILD_VERSION_PATCH=${VERSION_PATCH}) +add_compile_options(-DLIB_VERSION_STRING="${LIB_VERSION_STRING}") +add_compile_options(-DBUILD_VERSION_STRING="${BUILD_VERSION_STRING}") + +set(ROCBLAS_LIB "rocblas") +set(HIP_HCC_LIB "amdhip64") + +# Determine HSA_PATH +if(NOT DEFINED HIPCC_PATH) + if(NOT 
DEFINED ENV{HIPCC_PATH}) + set(HIPCC_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which hipcc runtime has been installed") + else() + set(HIPCC_PATH $ENV{HIPCC_PATH} CACHE PATH "Path to which hipcc runtime has been installed") + endif() +endif() + +# Determine HSA_PATH +if(NOT DEFINED HSA_PATH) + if(NOT DEFINED ENV{HSA_PATH}) + set(HSA_PATH "${ROCM_PATH}/hsa" CACHE PATH "Path to which HSA runtime has been installed") + else() + set(HSA_PATH $ENV{HSA_PATH} CACHE PATH "Path to which HSA runtime has been installed") + endif() +endif() + +# Add HIP_VERSION to CMAKE__FLAGS +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}") + +set(HIP_HCC_BUILD_FLAGS) +set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include ${ASAN_CXX_FLAGS}") + +# Set compiler and compiler flags +set(CMAKE_CXX_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_C_COMPILER "${HIPCC_PATH}/bin/hipcc") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS} -L${ROCM_PATH}/lib") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LD_FLAGS}") +set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LD_FLAGS}") + +if(BUILD_ADDRESS_SANITIZER) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.asan-x86_64.so + OUTPUT_VARIABLE ASAN_LIB_FULL_PATH) + get_filename_component(ASAN_LIB_PATH ${ASAN_LIB_FULL_PATH} DIRECTORY) +endif() + +# Determine Roc Runtime header files are accessible +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h) + message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h) + message("ERROR: ROC Runtime headers can't be found under specified path. 
Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) + RETURN() +endif() + +# Determine Roc Runtime header files are accessible +if(DEFINED RVS_ROCMSMI) + if(NOT RVS_ROCMSMI EQUAL 1) + if(NOT EXISTS ${ROCBLAS_INC_DIR}/rocblas.h) + message("ERROR: rocBLAS headers can't be found under specified path. Please set ROCBLAS_INC_DIR path. Current value is : " ${ROCBLAS_INC_DIR}) + RETURN() + endif() + + if(NOT EXISTS "${ROCBLAS_LIB_DIR}/lib${ROCBLAS_LIB}.so") + message("ERROR: rocBLAS library can't be found under specified path. Please set ROCBLAS_LIB_DIR path. Current value is : " ${ROCBLAS_LIB_DIR}) + RETURN() + endif() + endif() +endif() + + +if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() +endif() + +## define include directories +include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${ROCM_PATH}/include) +# Add directories to look for library files to link +link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) +## additional libraries +set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so numa hsa-runtime64 ) + +## define source files +set(SOURCES src/rvs_module.cpp src/action.cpp src/tfb_worker.cpp) + +## define target +add_library( ${RVS_TARGET} SHARED ${SOURCES}) +set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB}) +add_dependencies(${RVS_TARGET} rvslibrt rvslib) + +add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +COMMAND ln -fs ./lib${RVS}.so.${LIB_VERSION_STRING} lib${RVS}.so.${VERSION_MAJOR} WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} +COMMAND ln -fs ./lib${RVS}.so.${VERSION_MAJOR} lib${RVS}.so WORKING_DIRECTORY 
${CMAKE_RUNTIME_OUTPUT_DIRECTORY} +) + +install(TARGETS ${RVS_TARGET} LIBRARY DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule) +install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR}" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule) +install(FILES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so" DESTINATION ${CMAKE_PACKAGING_INSTALL_PREFIX}/rvs COMPONENT rvsmodule) + +# TEST SECTION +#if (RVS_BUILD_TESTS) +# add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +# COMMAND ln -fs ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/lib${RVS}.so.${VERSION_MAJOR} ${RVS_BINTEST_FOLDER}/lib${RVS}.so WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} +# ) +# include(${CMAKE_CURRENT_SOURCE_DIR}/tests.cmake) +#endif() diff --git a/transferbench.so/include/Compatibility.hpp b/transferbench.so/include/Compatibility.hpp new file mode 100644 index 00000000..5e76cf50 --- /dev/null +++ b/transferbench.so/include/Compatibility.hpp @@ -0,0 +1,93 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#if defined(__NVCC__) + +#include + +// ROCm specific +#define __builtin_amdgcn_s_memrealtime clock64 +#define gcnArchName name + +// Datatypes +#define hipDeviceProp_t cudaDeviceProp +#define hipError_t cudaError_t +#define hipEvent_t cudaEvent_t +#define hipStream_t cudaStream_t + +// Enumerations +#define hipDeviceAttributeClockRate cudaDevAttrClockRate +#define hipDeviceAttributeMaxSharedMemoryPerMultiprocessor cudaDevAttrMaxSharedMemoryPerMultiprocessor +#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount +#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled +#define hipFuncCachePreferShared cudaFuncCachePreferShared +#define hipMemcpyDefault cudaMemcpyDefault +#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define hipMemcpyHostToDevice cudaMemcpyHostToDevice +#define hipSuccess cudaSuccess + +// Functions +#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer +#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess +#define hipDeviceGetAttribute cudaDeviceGetAttribute +#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId +#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig +#define hipDeviceSynchronize cudaDeviceSynchronize +#define hipEventCreate cudaEventCreate +#define hipEventDestroy cudaEventDestroy +#define hipEventElapsedTime cudaEventElapsedTime +#define hipEventRecord cudaEventRecord +#define hipFree cudaFree +#define hipGetDeviceCount cudaGetDeviceCount +#define hipGetDeviceProperties cudaGetDeviceProperties +#define hipGetErrorString cudaGetErrorString +#define hipHostFree cudaFreeHost +#define hipHostMalloc cudaMallocHost +#define hipMalloc cudaMalloc +#define hipMemcpy cudaMemcpy +#define hipMemcpyAsync 
cudaMemcpyAsync +#define hipMemset cudaMemset +#define hipMemsetAsync cudaMemsetAsync +#define hipSetDevice cudaSetDevice +#define hipStreamCreate cudaStreamCreate +#define hipStreamDestroy cudaStreamDestroy +#define hipStreamSynchronize cudaStreamSynchronize + +// Define float4 addition operator for NVIDIA platform +__device__ inline float4& operator +=(float4& a, const float4& b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; + return a; +} + +#else + +#include +#include +#include + +#endif diff --git a/transferbench.so/include/EnvVars.hpp b/transferbench.so/include/EnvVars.hpp new file mode 100644 index 00000000..bac0746e --- /dev/null +++ b/transferbench.so/include/EnvVars.hpp @@ -0,0 +1,510 @@ +/* +Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef ENVVARS_HPP +#define ENVVARS_HPP + +#include +#include +#include +#include "Compatibility.hpp" +#include "Kernels.hpp" + +#define TB_VERSION "1.22" + +extern char const MemTypeStr[]; +extern char const ExeTypeStr[]; + +enum ConfigModeEnum +{ + CFG_FILE = 0, + CFG_P2P = 1, + CFG_SWEEP = 2 +}; + +// This class manages environment variable that affect TransferBench +class EnvVars +{ +public: + // Default configuration values + int const DEFAULT_NUM_WARMUPS = 1; + int const DEFAULT_NUM_ITERATIONS = 10; + int const DEFAULT_SAMPLING_FACTOR = 1; + + // Peer-to-peer Benchmark preset defaults + int const DEFAULT_P2P_NUM_CPU_SE = 4; + + // Sweep-preset defaults + std::string const DEFAULT_SWEEP_SRC = "CG"; + std::string const DEFAULT_SWEEP_EXE = "CDG"; + std::string const DEFAULT_SWEEP_DST = "CG"; + int const DEFAULT_SWEEP_MIN = 1; + int const DEFAULT_SWEEP_MAX = 24; + int const DEFAULT_SWEEP_TEST_LIMIT = 0; + int const DEFAULT_SWEEP_TIME_LIMIT = 0; + + // Environment variables + int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy + int byteOffset; // Byte-offset for memory allocations + int continueOnError; // Continue tests even after mismatch detected + int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected) + int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected) + int numIterations; // Number of timed iterations to perform. 
If negative, run for -numIterations seconds instead + int numWarmups; // Number of un-timed warmup iterations to perform + int outputToCsv; // Output in CSV format + int samplingFactor; // Affects how many different values of N are generated (when N set to 0) + int sharedMemBytes; // Amount of shared memory to use per threadblock + int useInteractive; // Pause for user-input before starting transfer loop + int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device + int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern) + int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer + int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host + + std::vector fillPattern; // Pattern of floats used to fill source data + + // Environment variables only for Benchmark-preset + int useRemoteRead; // Use destination memory type as executor instead of source memory type + int useDmaCopy; // Use DMA copy instead of GPU copy + int numGpuSubExecs; // Number of GPU subexecutors to use + int numCpuSubExecs; // Number of CPU subexecttors to use + + // Environment variables only for Sweep-preset + int sweepMin; // Min number of simultaneous Transfers to be executed per test + int sweepMax; // Max number of simulatneous Transfers to be executed per test + int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit) + int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit) + int sweepXgmiMin; // Min number of XGMI hops for Transfers + int sweepXgmiMax; // Max number of XGMI hops for Transfers (-1 = no limit) + int sweepSeed; // Random seed to use + int sweepRandBytes; // Whether or not to use random number of bytes per Transfer + std::string sweepSrc; // Set of src memory types to be swept + std::string sweepExe; // Set of executors to be swept + std::string sweepDst; // Set of dst memory 
types to be swept + + // Developer features + int enableDebug; // Enable debug output + int gpuKernel; // Which GPU kernel to use + + // Used to track current configuration mode + ConfigModeEnum configMode; + + // Random generator + std::default_random_engine *generator; + + // Track how many CPUs are available per NUMA node + std::vector numCpusPerNuma; + + // Constructor that collects values + EnvVars() + { + int maxSharedMemBytes = 0; + HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes, + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0)); +#if !defined(__NVCC__) + int defaultSharedMemBytes = maxSharedMemBytes / 2 + 1; +#else + int defaultSharedMemBytes = 0; +#endif + + int numDeviceCUs = 0; + HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0)); + + int numDetectedCpus = numa_num_configured_nodes(); + int numDetectedGpus; + HIP_CALL(hipGetDeviceCount(&numDetectedGpus)); + + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, 0)); + std::string fullName = prop.gcnArchName; + std::string archName = fullName.substr(0, fullName.find(':')); + + // Different hardware pick different GPU kernels + // This performance difference is generally only noticable when executing fewer CUs + int defaultGpuKernel = 0; + if (archName == "gfx906") defaultGpuKernel = 13; + else if (archName == "gfx90a") defaultGpuKernel = 9; + + blockBytes = GetEnvVar("BLOCK_BYTES" , 256); + byteOffset = GetEnvVar("BYTE_OFFSET" , 0); + continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0); + numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus); + numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus); + numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS); + numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); + outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); + samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR); + sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes); + 
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); + usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); + usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0); + useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); + validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0); + enableDebug = GetEnvVar("DEBUG" , 0); + gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel); + + // P2P Benchmark related + useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0); + useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); + numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs); + numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE); + + // Sweep related + sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN); + sweepMax = GetEnvVar("SWEEP_MAX" , DEFAULT_SWEEP_MAX); + sweepSrc = GetEnvVar("SWEEP_SRC" , DEFAULT_SWEEP_SRC); + sweepExe = GetEnvVar("SWEEP_EXE" , DEFAULT_SWEEP_EXE); + sweepDst = GetEnvVar("SWEEP_DST" , DEFAULT_SWEEP_DST); + sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT" , DEFAULT_SWEEP_TEST_LIMIT); + sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT" , DEFAULT_SWEEP_TIME_LIMIT); + sweepXgmiMin = GetEnvVar("SWEEP_XGMI_MIN" , 0); + sweepXgmiMax = GetEnvVar("SWEEP_XGMI_MAX" , -1); + sweepRandBytes = GetEnvVar("SWEEP_RAND_BYTES" , 0); + + // Determine random seed + char *sweepSeedStr = getenv("SWEEP_SEED"); + sweepSeed = (sweepSeedStr != NULL ? 
atoi(sweepSeedStr) : time(NULL)); + generator = new std::default_random_engine(sweepSeed); + + // Check for fill pattern + char* pattern = getenv("FILL_PATTERN"); + if (pattern != NULL) + { + if (usePrepSrcKernel) + { + printf("[ERROR] Unable to use FILL_PATTERN and USE_PREP_KERNEL together\n"); + exit(1); + } + + int patternLen = strlen(pattern); + if (patternLen % 2) + { + printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n"); + exit(1); + } + + // Read in bytes + std::vector bytes; + unsigned char val = 0; + for (int i = 0; i < patternLen; i++) + { + if ('0' <= pattern[i] && pattern[i] <= '9') + val += (pattern[i] - '0'); + else if ('A' <= pattern[i] && pattern[i] <= 'F') + val += (pattern[i] - 'A' + 10); + else if ('a' <= pattern[i] && pattern[i] <= 'f') + val += (pattern[i] - 'a' + 10); + else + { + printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]); + exit(1); + } + + if (i % 2 == 0) + val <<= 4; + else + { + bytes.push_back(val); + val = 0; + } + } + + // Reverse bytes (input is assumed to be given in big-endian) + std::reverse(bytes.begin(), bytes.end()); + + // Figure out how many copies of the pattern are necessary to fill a 4-byte float properly + int copies; + switch (patternLen % 8) + { + case 0: copies = 1; break; + case 4: copies = 2; break; + default: copies = 4; break; + } + + // Fill floats + int numFloats = copies * patternLen / 8; + fillPattern.resize(numFloats); + unsigned char* rawData = (unsigned char*) fillPattern.data(); + for (int i = 0; i < numFloats * 4; i++) + rawData[i] = bytes[i % bytes.size()]; + } + else fillPattern.clear(); + + // Perform some basic validation + if (numCpuDevices > numDetectedCpus) + { + printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus); + exit(1); + } + if (numGpuDevices > numDetectedGpus) + { + printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of 
detected GPUs (%d)\n", numGpuDevices, numDetectedGpus); + exit(1); + } + if (byteOffset % sizeof(float)) + { + printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float)); + exit(1); + } + if (numWarmups < 0) + { + printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n"); + exit(1); + } + if (samplingFactor < 1) + { + printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n"); + exit(1); + } + if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes) + { + printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes); + exit(1); + } + if (blockBytes <= 0 || blockBytes % 4) + { + printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n"); + exit(1); + } + + if (numGpuSubExecs <= 0) + { + printf("[ERROR] NUM_GPU_SE must be greater than 0\n"); + exit(1); + } + + if (numCpuSubExecs <= 0) + { + printf("[ERROR] NUM_CPU_SE must be greater than 0\n"); + exit(1); + } + + for (auto ch : sweepSrc) + { + if (!strchr(MemTypeStr, ch)) + { + printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch); + exit(1); + } + if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) + { + printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch); + exit(1); + } + } + + for (auto ch : sweepDst) + { + if (!strchr(MemTypeStr, ch)) + { + printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch); + exit(1); + } + if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) + { + printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch); + exit(1); + } + } + + for (auto ch : sweepExe) + { + if (!strchr(ExeTypeStr, ch)) + { + printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch); + exit(1); + } + if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) + { + printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch); + exit(1); + } + } + if (gpuKernel < 0 || 
gpuKernel > NUM_GPU_KERNELS) + { + printf("[ERROR] GPU kernel must be between 0 and %d\n", NUM_GPU_KERNELS); + exit(1); + } + + // Determine how many CPUs exit per NUMA node (to avoid executing on NUMA without CPUs) + numCpusPerNuma.resize(numDetectedCpus); + int const totalCpus = numa_num_configured_cpus(); + for (int i = 0; i < totalCpus; i++) + numCpusPerNuma[numa_node_of_cpu(i)]++; + + // Check for deprecated env vars + if (getenv("USE_HIP_CALL")) + { + printf("[WARN] USE_HIP_CALL has been deprecated. Please use DMA executor 'D' or set USE_GPU_DMA for P2P-Benchmark preset\n"); + exit(1); + } + + char* enableSdma = getenv("HSA_ENABLE_SDMA"); + if (enableSdma && !strcmp(enableSdma, "0")) + { + printf("[WARN] DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. Copies will fallback to blit kernels\n"); + } + } + + // Display info on the env vars that can be used + static void DisplayUsage() + { + printf("Environment variables:\n"); + printf("======================\n"); + printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n"); + printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); + printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n"); + printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n"); + printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n"); + printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. 
May not be greater than # detected HIP devices\n"); + printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); + printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); + printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); + printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); + printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); + printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); + printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); + printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n"); + printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); + printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n"); + } + + // Helper macro to switch between CSV and terminal output +#define PRINT_EV(NAME, VALUE, DESCRIPTION) \ + printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str()) + +#define PRINT_ES(NAME, VALUE, DESCRIPTION) \ + printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? 
"," : " : ", (DESCRIPTION).c_str()) + + // Display env var settings + void DisplayEnvVars() const + { + if (!outputToCsv) + { + printf("TransferBench v%s\n", TB_VERSION); + printf("=====================================================\n"); + printf("[Common]\n"); + } + else + printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION); + + PRINT_EV("BLOCK_BYTES", blockBytes, + std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy")); + PRINT_EV("BYTE_OFFSET", byteOffset, + std::string("Using byte offset of " + std::to_string(byteOffset))); + PRINT_EV("CONTINUE_ON_ERROR", continueOnError, + std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error")); + PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, + (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString())); + PRINT_EV("GPU_KERNEL", gpuKernel, + std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]"); + PRINT_EV("NUM_CPU_DEVICES", numCpuDevices, + std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices"); + PRINT_EV("NUM_GPU_DEVICES", numGpuDevices, + std::string("Using ") + std::to_string(numGpuDevices) + " GPU devices"); + PRINT_EV("NUM_ITERATIONS", numIterations, + std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " " + + (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test")); + PRINT_EV("NUM_WARMUPS", numWarmups, + std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test")); + PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes, + std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock")); + PRINT_EV("USE_INTERACTIVE", useInteractive, + std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode"); + PRINT_EV("USE_PCIE_INDEX", usePcieIndexing, + std::string("Use ") + (usePcieIndexing ? 
"PCIe" : "HIP") + " GPU device indexing"); + PRINT_EV("USE_PREP_KERNEL", usePrepSrcKernel, + std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data"); + PRINT_EV("USE_SINGLE_STREAM", useSingleStream, + std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer")); + PRINT_EV("VALIDATE_DIRECT", validateDirect, + std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer")); + printf("\n"); + }; + + // Display env var for P2P Benchmark preset + void DisplayP2PBenchmarkEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[P2P Related]\n"); + + PRINT_EV("NUM_CPU_SE", numCpuSubExecs, + std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors"); + PRINT_EV("NUM_GPU_SE", numGpuSubExecs, + std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors"); + PRINT_EV("USE_GPU_DMA", useDmaCopy, + std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor"); + PRINT_EV("USE_REMOTE_READ", useRemoteRead, + std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor"); + printf("\n"); + } + + // Display env var settings + void DisplaySweepEnvVars() const + { + DisplayEnvVars(); + + if (!outputToCsv) + printf("[Sweep Related]\n"); + PRINT_ES("SWEEP_DST", sweepDst.c_str(), + std::string("Destination Memory Types to sweep")); + PRINT_ES("SWEEP_EXE", sweepExe.c_str(), + std::string("Executor Types to sweep")); + PRINT_EV("SWEEP_MAX", sweepMax, + std::string("Max simultaneous transfers (0 = no limit)")); + PRINT_EV("SWEEP_MIN", sweepMin, + std::string("Min simultaenous transfers")); + PRINT_EV("SWEEP_RAND_BYTES", sweepRandBytes, + std::string("Using ") + (sweepRandBytes ? 
"random" : "constant") + " number of bytes per Transfer"); + PRINT_EV("SWEEP_SEED", sweepSeed, + std::string("Random seed set to ") + std::to_string(sweepSeed)); + PRINT_ES("SWEEP_SRC", sweepSrc.c_str(), + std::string("Source Memory Types to sweep")); + PRINT_EV("SWEEP_TEST_LIMIT", sweepTestLimit, + std::string("Max number of tests to run during sweep (0 = no limit)")); + PRINT_EV("SWEEP_TIME_LIMIT", sweepTimeLimit, + std::string("Max number of seconds to run sweep for (0 = no limit)")); + PRINT_EV("SWEEP_XGMI_MAX", sweepXgmiMax, + std::string("Max number of XGMI hops for Transfers (-1 = no limit)")); + PRINT_EV("SWEEP_XGMI_MIN", sweepXgmiMin, + std::string("Min number of XGMI hops for Transfers")); + printf("\n"); + } + + // Helper function that gets parses environment variable or sets to default value + static int GetEnvVar(std::string const& varname, int defaultValue) + { + if (getenv(varname.c_str())) + return atoi(getenv(varname.c_str())); + return defaultValue; + } + + static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue) + { + if (getenv(varname.c_str())) + return getenv(varname.c_str()); + return defaultValue; + } +}; + +#endif diff --git a/transferbench.so/include/GetClosestNumaNode.hpp b/transferbench.so/include/GetClosestNumaNode.hpp new file mode 100644 index 00000000..266f1eb1 --- /dev/null +++ b/transferbench.so/include/GetClosestNumaNode.hpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +// Helper macro for checking HSA calls +#define HSA_CHECK(cmd) \ + do { \ + hsa_status_t error = (cmd); \ + if (error != HSA_STATUS_SUCCESS) { \ + const char* errString = NULL; \ + hsa_status_string(error, &errString); \ + std::cerr << "Encountered HSA error (" << errString << ") at line " \ + << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while (0) + +// Structure to hold HSA agent information +#if !defined(__NVCC__) +struct AgentData +{ + bool isInitialized; + std::vector cpuAgents; + std::vector gpuAgents; + std::vector closestNumaNode; +}; + +// Simple callback function to return any memory pool for an agent +hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data) +{ + hsa_amd_memory_pool_t* poolData = reinterpret_cast(data); + + // Check memory pool flags + uint32_t poolFlags; + HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &poolFlags)); + + // Only consider coarse-grained pools + if (!(poolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) return HSA_STATUS_SUCCESS; + + *poolData = pool; + return HSA_STATUS_SUCCESS; +} + +// Callback function to gather HSA agent information +hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data) +{ + AgentData* agentData = reinterpret_cast(data); + + // Get the device type + hsa_device_type_t deviceType; + HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType)); + if (deviceType == HSA_DEVICE_TYPE_CPU) + agentData->cpuAgents.push_back(agent); + if (deviceType == HSA_DEVICE_TYPE_GPU) + { + agentData->gpuAgents.push_back(agent); + agentData->closestNumaNode.push_back(0); + } + + return HSA_STATUS_SUCCESS; +} + +AgentData& GetAgentData() +{ + static AgentData agentData = {}; + + if (!agentData.isInitialized) + { + agentData.isInitialized = true; + + // Add all detected agents to the list + HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData)); + + // Loop over each GPU + for (uint32_t i = 0; i < 
agentData.gpuAgents.size(); i++) + { + // Collect memory pool + hsa_amd_memory_pool_t pool; + HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool)); + + // Loop over each CPU agent and check distance + int bestDistance = -1; + for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) + { + // Determine number of hops from GPU memory pool to CPU agent + uint32_t hops = 0; + HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, + &hops)); + // Gather link info + hsa_amd_memory_pool_link_info_t* link_info = + (hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t)); + HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], + pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, + link_info)); + int numaDist = 0; + for (int k = 0; k < hops; k++) + { + numaDist += link_info[k].numa_distance; + } + if (bestDistance == -1 || numaDist < bestDistance) + { + agentData.closestNumaNode[i] = j; + bestDistance = numaDist; + } + free(link_info); + } + } + } + return agentData; +} +#endif + +// Returns closest CPU NUMA node to provided GPU +// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing +int GetClosestNumaNode(int gpuIdx) +{ +#if defined(__NVCC__) + return -1; +#else + AgentData& agentData = GetAgentData(); + if (gpuIdx < 0 || gpuIdx >= agentData.closestNumaNode.size()) + { + printf("[ERROR] GPU index out is out of bounds\n"); + exit(1); + } + return agentData.closestNumaNode[gpuIdx]; +#endif +} diff --git a/transferbench.so/include/Kernels.hpp b/transferbench.so/include/Kernels.hpp new file mode 100644 index 00000000..dcb6bf07 --- /dev/null +++ b/transferbench.so/include/Kernels.hpp @@ -0,0 +1,409 @@ +/* +Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +#define PackedFloat_t float4 +#define WARP_SIZE 64 +#define BLOCKSIZE 256 +#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float)) +#define MEMSET_CHAR 75 +#define MEMSET_VAL 13323083.0f + +// Each subExecutor is provided with subarrays to work on +#define MAX_SRCS 16 +#define MAX_DSTS 16 +struct SubExecParam +{ + size_t N; // Number of floats this subExecutor works on + int numSrcs; // Number of source arrays + int numDsts; // Number of destination arrays + float* src[MAX_SRCS]; // Source array pointers + float* dst[MAX_DSTS]; // Destination array pointers + long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor) + long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor) +}; + +void CpuReduceKernel(SubExecParam const& p) +{ + int const& numSrcs = p.numSrcs; + int const& numDsts = p.numDsts; + + if (numSrcs == 0) + { + for (int i = 0; i < numDsts; ++i) + memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float)); + } + else if (numSrcs == 1) + { + float const* __restrict__ src = p.src[0]; + for (int i = 0; i < numDsts; ++i) + { + memcpy(p.dst[i], src, p.N * sizeof(float)); + } + } + else + { + for (int j = 0; j < p.N; j++) + { + float sum = p.src[0][j]; + for (int i = 1; i < numSrcs; i++) sum += p.src[i][j]; + for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum; + } + } +} + +std::string PrepSrcValueString() +{ + return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)"; +} + +__host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx) +{ + return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1); +} + +// GPU kernel to prepare src buffer data +__global__ void +PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx) +{ + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < N; + idx += blockDim.x * gridDim.x) + { + ptr[idx] = PrepSrcValue(srcBufferIdx, idx); + } +} + +// Helper function for memset +template __device__ __forceinline__ T MemsetVal(); +template 
<> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; }; +template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); } + +// GPU copy kernel 0: 3 loops: unroll float 4, float4s, floats +template +__global__ void __launch_bounds__(BLOCKSIZE) +GpuReduceKernel(SubExecParam* params) +{ + int64_t startCycle = wall_clock64(); + + // Operate on wavefront granularity + SubExecParam& p = params[blockIdx.x]; + int const numSrcs = p.numSrcs; + int const numDsts = p.numDsts; + int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number + int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront + + // 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration + // Determine the number of packed floats processed by the first loop + size_t Nrem = p.N; + size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE); + size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK; + size_t const loop1Inc = BLOCKSIZE * LOOP1_UNROLL; + size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId; + + while (loop1Offset < loop1Npack) + { + PackedFloat_t vals[LOOP1_UNROLL] = {}; + + if (numSrcs == 0) + { + #pragma unroll + for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal(); + } + else + { + for (int i = 0; i < numSrcs; ++i) + { + PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset; + #pragma unroll + for (int u = 0; u < LOOP1_UNROLL; ++u) + vals[u] += *(packedSrc + u * WARP_SIZE); + } + } + + for (int i = 0; i < numDsts; ++i) + { + PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset; + #pragma unroll + for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u]; + } + loop1Offset += loop1Inc; + } + Nrem -= loop1Nelem; + + if (Nrem > 0) + { + // 2nd loop - Each thread operates on FLOATS_PER_PACK per 
iteration + // NOTE: Using int32_t due to smaller size requirements + int32_t const loop2Npack = Nrem / FLOATS_PER_PACK; + int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK; + int32_t const loop2Inc = BLOCKSIZE; + int32_t loop2Offset = threadIdx.x; + + while (loop2Offset < loop2Npack) + { + PackedFloat_t val; + if (numSrcs == 0) + { + val = MemsetVal(); + } + else + { + val = {}; + for (int i = 0; i < numSrcs; ++i) + { + PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset; + val += *packedSrc; + } + } + + for (int i = 0; i < numDsts; ++i) + { + PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset; + *packedDst = val; + } + loop2Offset += loop2Inc; + } + Nrem -= loop2Nelem; + + // Deal with leftovers less than FLOATS_PER_PACK) + if (threadIdx.x < Nrem) + { + int offset = loop1Nelem + loop2Nelem + threadIdx.x; + float val = 0; + if (numSrcs == 0) + { + val = MEMSET_VAL; + } + else + { + for (int i = 0; i < numSrcs; ++i) + val += p.src[i][offset]; + } + + for (int i = 0; i < numDsts; ++i) + p.dst[i][offset] = val; + } + } + + __syncthreads(); + if (threadIdx.x == 0) + { + p.startCycle = startCycle; + p.stopCycle = wall_clock64(); + } +} + +template +__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N) +{ + int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread + size_t constexpr loopPackInc = BLOCKSIZE * UNROLL_FACTOR; + size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR; + int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number + int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront + int const numSrcs = p.numSrcs; + int const numDsts = p.numDsts; + size_t const numPacksDone = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? 
N : (N / (FLOATS_PER_PACK * numPacksPerWave)) * numPacksPerWave; + size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack; + size_t loopPackOffset = waveId * numPacksPerWave + threadId; + + while (loopPackOffset < numPacksDone) + { + FLOAT_TYPE vals[UNROLL_FACTOR]; + + if (numSrcs == 0) + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal(); + } + else + { + FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset; + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + vals[u] = *(src0Ptr + u * WARP_SIZE); + + for (int i = 1; i < numSrcs; ++i) + { + FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset; + + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + vals[u] += *(srcPtr + u * WARP_SIZE); + } + } + + for (int i = 0; i < numDsts; ++i) + { + FLOAT_TYPE* __restrict__ dstPtr = (FLOAT_TYPE*)(p.dst[i + offset]) + loopPackOffset; + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + *(dstPtr + u * WARP_SIZE) = vals[u]; + } + loopPackOffset += loopPackInc; + } + + return numFloatsLeft; +} + +template +__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N) +{ + // Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop + int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float); + size_t constexpr numFloatsPerInnerLoop = BLOCKSIZE * numFloatsPerRead; + size_t constexpr numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR; + size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 
0 : N % numFloatsPerOuterLoop; + size_t const numFloatsDone = N - numFloatsLeft; + int const numSrcs = p.numSrcs; + int const numDsts = p.numDsts; + + for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop) + { + FLOAT_TYPE tmp[UNROLL_FACTOR]; + + if (numSrcs == 0) + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + tmp[u] = MemsetVal(); + } + else + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop])); + + for (int i = 1; i < numSrcs; ++i) + { + #pragma unroll UNROLL_FACTOR + for (int u = 0; u < UNROLL_FACTOR; ++u) + tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop])); + } + } + + for (int i = 0; i < numDsts; ++i) + { + for (int u = 0; u < UNROLL_FACTOR; ++u) + { + *((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u]; + } + } + } + return numFloatsLeft; +} + +template +__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll) +{ + switch (unroll) + { + case 1: return GpuReduceFuncImpl(p, offset, N); + case 2: return GpuReduceFuncImpl(p, offset, N); + case 3: return GpuReduceFuncImpl(p, offset, N); + case 4: return GpuReduceFuncImpl(p, offset, N); + case 5: return GpuReduceFuncImpl(p, offset, N); + case 6: return GpuReduceFuncImpl(p, offset, N); + case 7: return GpuReduceFuncImpl(p, offset, N); + case 8: return GpuReduceFuncImpl(p, offset, N); + case 9: return GpuReduceFuncImpl(p, offset, N); + case 10: return GpuReduceFuncImpl(p, offset, N); + case 11: return GpuReduceFuncImpl(p, offset, N); + case 12: return GpuReduceFuncImpl(p, offset, N); + case 13: return GpuReduceFuncImpl(p, offset, N); + case 14: return GpuReduceFuncImpl(p, offset, N); + case 15: return GpuReduceFuncImpl(p, offset, N); + case 16: return GpuReduceFuncImpl(p, offset, N); + default: return GpuReduceFuncImpl(p, offset, 
N); + } +} + +// GPU copy kernel +__global__ void __launch_bounds__(BLOCKSIZE) +GpuReduceKernel2(SubExecParam* params) +{ + int64_t startCycle = wall_clock64(); + SubExecParam& p = params[blockIdx.x]; + + size_t numFloatsLeft = GpuReduceFunc(p, 0, p.N, 8); + if (numFloatsLeft) + numFloatsLeft = GpuReduceFunc(p, p.N - numFloatsLeft, numFloatsLeft, 1); + + if (numFloatsLeft) + GpuReduceFunc(p, p.N - numFloatsLeft, numFloatsLeft, 1); + + __threadfence_system(); + if (threadIdx.x == 0) + { + p.startCycle = startCycle; + p.stopCycle = wall_clock64(); + } +} + +#define NUM_GPU_KERNELS 18 +typedef void (*GpuKernelFuncPtr)(SubExecParam*); + +GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] = +{ + GpuReduceKernel<8>, + GpuReduceKernel<1>, + GpuReduceKernel<2>, + GpuReduceKernel<3>, + GpuReduceKernel<4>, + GpuReduceKernel<5>, + GpuReduceKernel<6>, + GpuReduceKernel<7>, + GpuReduceKernel<8>, + GpuReduceKernel<9>, + GpuReduceKernel<10>, + GpuReduceKernel<11>, + GpuReduceKernel<12>, + GpuReduceKernel<13>, + GpuReduceKernel<14>, + GpuReduceKernel<15>, + GpuReduceKernel<16>, + GpuReduceKernel2 +}; + +std::string GpuKernelNames[NUM_GPU_KERNELS] = +{ + "Default - 8xUnroll", + "Unroll x1", + "Unroll x2", + "Unroll x3", + "Unroll x4", + "Unroll x5", + "Unroll x6", + "Unroll x7", + "Unroll x8", + "Unroll x9", + "Unroll x10", + "Unroll x11", + "Unroll x12", + "Unroll x13", + "Unroll x14", + "Unroll x15", + "Unroll x16", + "8xUnrollB", +}; diff --git a/transferbench.so/include/TransferBench.hpp b/transferbench.so/include/TransferBench.hpp new file mode 100644 index 00000000..6abbbc5b --- /dev/null +++ b/transferbench.so/include/TransferBench.hpp @@ -0,0 +1,198 @@ +/* +Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Compatibility.hpp" + +// Helper macro for catching HIP errors +#define HIP_CALL(cmd) \ + do { \ + hipError_t error = (cmd); \ + if (error != hipSuccess) \ + { \ + std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \ + << ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while (0) + +#include "EnvVars.hpp" + +// Simple configuration parameters +size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer + +// Different src/dst memory types supported +typedef enum +{ + MEM_CPU = 0, // Coarse-grained pinned CPU memory + MEM_GPU = 1, // Coarse-grained global GPU memory + MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory + MEM_GPU_FINE = 3, // Fine-grained global GPU memory + MEM_CPU_UNPINNED = 4, // Unpinned CPU memory + MEM_NULL = 5, // NULL memory - used for empty +} MemType; + +typedef enum +{ + EXE_CPU = 0, // CPU executor (subExecutor = CPU thread) + EXE_GPU_GFX = 1, // GPU kernel-based executor (subExecutor = threadblock/CU) + EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams) +} ExeType; + +bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE); } +bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); }; +bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); }; +bool IsCpuType(ExeType e) { return (e == EXE_CPU); }; + +char const MemTypeStr[7] = "CGBFUN"; +char const ExeTypeStr[4] = "CGD"; +char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"}; + +MemType inline CharToMemType(char const c) +{ + char const* val = strchr(MemTypeStr, toupper(c)); + if (*val) return (MemType)(val - MemTypeStr); + printf("[ERROR] Unexpected memory type (%c)\n", c); + exit(1); +} + +ExeType inline CharToExeType(char const c) +{ + char const* val = strchr(ExeTypeStr, 
toupper(c)); + if (*val) return (ExeType)(val - ExeTypeStr); + printf("[ERROR] Unexpected executor type (%c)\n", c); + exit(1); +} + +// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified) +// then writes the summation to each of the specified destination memory location(s) +struct Transfer +{ + int transferIndex; // Transfer identifier (within a Test) + ExeType exeType; // Transfer executor type + int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU) + int numSubExecs; // Number of subExecutors to use for this Transfer + size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default) + size_t numBytesActual; // Actual number of bytes to copy + double transferTime; // Time taken in milliseconds + + int numSrcs; // Number of sources + std::vector srcType; // Source memory types + std::vector srcIndex; // Source device indice + std::vector srcMem; // Source memory + + int numDsts; // Number of destinations + std::vector dstType; // Destination memory type + std::vector dstIndex; // Destination device index + std::vector dstMem; // Destination memory + + std::vector subExecParam; // Defines subarrays assigned to each threadblock + SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam + + // Prepares src/dst subarray pointers for each SubExecutor + void PrepareSubExecParams(EnvVars const& ev); + + // Prepare source arrays with input data + bool PrepareSrc(EnvVars const& ev); + + // Validate that destination data contains expected results + void ValidateDst(EnvVars const& ev); + + // Prepare reference buffers + void PrepareReference(EnvVars const& ev, std::vector& buffer, int bufferIdx); + + // String representation functions + std::string SrcToStr() const; + std::string DstToStr() const; +}; + +struct ExecutorInfo +{ + std::vector transfers; // Transfers to execute + size_t totalBytes; // Total bytes this executor transfers + int totalSubExecs; // Total 
number of subExecutors to use + + // For GPU-Executors + SubExecParam* subExecParamGpu; // GPU copy of subExecutor parameters + std::vector streams; + std::vector startEvents; + std::vector stopEvents; + + // Results + double totalTime; +}; + +typedef std::pair Executor; +typedef std::map TransferMap; + +// Display usage instructions +void DisplayUsage(char const* cmdName); + +// Display detected GPU topology / CPU numa nodes +void DisplayTopology(bool const outputToCsv); + +// Build array of test sizes based on sampling factor +void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor, + std::vector& valuesofN); + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, + std::vector& memType, std::vector& memIndex); +void ParseExeType(std::string const& token, int const numCpus, int const numGpus, + ExeType& exeType, int& exeIndex); + +void ParseTransfers(char* line, int numCpus, int numGpus, + std::vector& transfers); + +void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N, + std::vector& transfers, bool verbose = true); + +void EnablePeerAccess(int const deviceId, int const peerDeviceId); +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr); +void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0); +void CheckPages(char* byteArray, size_t numBytes, int targetId); +void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx); +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N); +void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom); + +// Return the maximum bandwidth measured for given (src/dst) pair +double GetPeakBandwidth(EnvVars const& ev, size_t const N, + int const isBidirectional, + MemType const srcType, int const srcIndex, + MemType const dstType, int const dstIndex); + +std::string 
GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); + +int RemappedIndex(int const origIdx, bool const isCpuType); +int GetWallClockRate(int deviceId); +void LogTransfers(FILE *fp, int const testNum, std::vector const& transfers); +std::string PtrVectorToStr(std::vector const& strVector, int const initOffset); diff --git a/transferbench.so/include/action.h b/transferbench.so/include/action.h new file mode 100644 index 00000000..4c2518a5 --- /dev/null +++ b/transferbench.so/include/action.h @@ -0,0 +1,86 @@ +/******************************************************************************** + * + * Copyright (c) 2018 ROCm Developer Tools + * + * MIT LICENSE: + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef HIPTEST_SO_INCLUDE_ACTION_H_ +#define HIPTEST_SO_INCLUDE_ACTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif +#include +#ifdef __cplusplus +} +#endif + +#include +#include +#include + +#include "include/rvsactionbase.h" + +using std::vector; +using std::string; +using std::map; + +/** + * @class tfb_action + * @ingroup GST + * + * @brief GST action implementation class + * + * Derives from rvs::actionbase and implements actual action functionality + * in its run() method. + * + */ +class tfb_action: public rvs::actionbase { + public: + tfb_action(); + virtual ~tfb_action(); + + virtual int run(void); + + std::string m_transfer_type; + + protected: + //! TRUE if JSON output is required + bool bjson; + + bool get_all_tfb_config_keys(void); + /** + * @brief reads all common configuration keys from + * the module's properties collection + * @return true if no fatal error occured, false otherwise + */ + bool get_all_common_config_keys(void); + + int get_num_amd_gpu_devices(void); + bool start_tfb_runners(); + /** + * @brief gets the number of ROCm compatible AMD GPUs + * @return run number of GPUs + */ + int run_hip_tests(); +}; + +#endif // HIPTEST_SO_INCLUDE_ACTION_H_ diff --git a/transferbench.so/include/rvs_module.h b/transferbench.so/include/rvs_module.h new file mode 100644 index 00000000..9edaec65 --- /dev/null +++ b/transferbench.so/include/rvs_module.h @@ -0,0 +1,31 @@ +/******************************************************************************** + * + * Copyright (c) 2018 ROCm Developer Tools + * + * MIT LICENSE: + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and 
to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef HIPTEST_SO_INCLUDE_RVS_MODULE_H_ +#define HIPTEST_SO_INCLUDE_RVS_MODULE_H_ + +#include "include/rvsliblog.h" + + +#endif // HIPTEST_SO_INCLUDE_RVS_MODULE_H_ diff --git a/transferbench.so/include/tfb_worker.h b/transferbench.so/include/tfb_worker.h new file mode 100644 index 00000000..6eeda99a --- /dev/null +++ b/transferbench.so/include/tfb_worker.h @@ -0,0 +1,74 @@ +/******************************************************************************** + * + * Copyright (c) 2018 ROCm Developer Tools + * + * MIT LICENSE: + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef HIPTEST_SO_INCLUDE_HIPTEST_WORKER_H_
+#define HIPTEST_SO_INCLUDE_HIPTEST_WORKER_H_
+
+#include <string>  /* NOTE(review): include targets were lost in transit -- confirm */
+#include <memory>
+#include "include/rvsthreadbase.h"
+#include "include/rvs_blas.h"
+#include "include/rvs_util.h"
+
+#define GST_RESULT_PASS_MESSAGE "true"
+#define GST_RESULT_FAIL_MESSAGE "false"
+
+
+/**
+ * @class tfbWorker
+ * @ingroup GST
+ *
+ * @brief tfbWorker action implementation class
+ *
+ * Derives from rvs::ThreadBase and implements actual action functionality
+ * in its run() method.
+ *
+ * NOTE(review): src/hiptest_worker.cpp implements a separate `hipTestWorker`
+ * class and includes "include/hiptest_worker.h", which this patch does not
+ * add -- one of the two workers is likely meant to be removed; confirm.
+ */
+class tfbWorker : public rvs::ThreadBase {
+ public:
+  tfbWorker();
+  virtual ~tfbWorker();
+
+  //! sets action name
+  void set_name(const std::string& name) { action_name = name; }
+  //! returns action name
+  const std::string& get_name(void) { return action_name; }
+
+  //! sets test path
+  void set_path(std::string pathname) { m_test_path = pathname; }
+  //! sets the argument string handed to the test executable
+  void set_args(std::string args) { m_test_args = args; }
+
+  //! returns test path
+  const std::string& get_path(void) { return m_test_path; }
+  //! forks/execs the test; declared here, defined in the worker source
+  bool start_hip_tests(int &error, std::string &errdesc);
+ protected:
+  //! thread entry point
+  virtual void run(void);
+ protected:
+  //! name of the action
+  std::string action_name;
+  //! path to execute test
+  std::string m_test_path;
+  //! arguments passed to the test executable
+  std::string m_test_args;
+};
+
+#endif  // HIPTEST_SO_INCLUDE_HIPTEST_WORKER_H_
diff --git a/transferbench.so/src/.gitignore b/transferbench.so/src/.gitignore
new file mode 100644
index 00000000..6677c873
--- /dev/null
+++ b/transferbench.so/src/.gitignore
@@ -0,0 +1 @@
+/libmain.cpp
diff --git a/transferbench.so/src/action.cpp b/transferbench.so/src/action.cpp
new file mode 100644
index 00000000..50c721c5
--- /dev/null
+++ b/transferbench.so/src/action.cpp
@@ -0,0 +1,215 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/action.h"
+
+#include <string>
+#include <vector>
+#include <map>
+#include <regex>
+#include <utility>
+#include <algorithm>
+#include <hip/hip_runtime.h>  /* NOTE(review): include targets were lost in transit -- restored from usage, confirm */
+
+#define __HIP_PLATFORM_HCC__
+
+#include "include/rvs_key_def.h"
+#include "include/tfb_worker.h"
+#include "include/rvsactionbase.h"
+#include "include/rvsloglp.h"
+
+using std::string;
+using std::vector;
+using std::map;
+using std::regex;
+
+#define RVS_CONF_TFB_TYPE_KEY "transfer-type"
+#define TFB_DEFAULT_TYPE "p2p"
+#define TFB_NO_COMPATIBLE_GPUS "No AMD compatible GPU found!"
+
+#define FLOATING_POINT_REGEX "^[0-9]*\\.?[0-9]+$"
+
+#define JSON_CREATE_NODE_ERROR "JSON cannot create node"
+
+/**
+ * @brief default class constructor
+ */
+tfb_action::tfb_action() {
+  bjson = false;
+}
+
+/**
+ * @brief class destructor
+ */
+tfb_action::~tfb_action() {
+  property.clear();
+}
+
+/**
+ * @brief runs the transferbench session
+ * @return true if no error occured, false otherwise
+ */
+bool tfb_action::start_tfb_runners() {
+  // one worker sufficient, as test runner
+  // (was `hipTestWorker`, which is not declared by the included tfb_worker.h)
+  tfbWorker worker;
+  worker.set_name(action_name);
+  worker.set_path(m_transfer_type);
+  worker.start();
+  worker.join();
+
+  return rvs::lp::Stopping() ? false : true;
+}
+
+/**
+ * @brief reads all transferbench-related configuration keys from
+ * the module's properties collection
+ * @return true if no fatal error occured, false otherwise
+ */
+bool tfb_action::get_all_tfb_config_keys(void) {
+  string msg;
+  bool bsts = true;
+
+  if (property_get(RVS_CONF_TFB_TYPE_KEY, &m_transfer_type,
+                   TFB_DEFAULT_TYPE)) {
+    // was RVS_CONF_GST_OPS_TYPE, which is not defined in this module
+    msg = "invalid '" +
+        std::string(RVS_CONF_TFB_TYPE_KEY) + "' key value";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    bsts = false;
+  }
+
+  return bsts;
+}
+
+/**
+ * @brief reads all common configuration keys from
+ * the module's properties collection
+ * (a truncated duplicate definition of this function was removed)
+ * @return true if no fatal error occured, false otherwise
+ */
+bool tfb_action::get_all_common_config_keys(void) {
+  string msg, sdevid, sdev;
+  bool bsts = true;
+
+  // place holder for later
+
+  return bsts;
+}
+
+/**
+ * @brief gets the number of ROCm compatible AMD GPUs
+ * @return number of GPUs, 0 when none found, -1 on JSON logging error
+ */
+int tfb_action::get_num_amd_gpu_devices(void) {
+  int hip_num_gpu_devices;
+  string msg;
+
+  hipGetDeviceCount(&hip_num_gpu_devices);
+  if (hip_num_gpu_devices == 0) {  // no AMD compatible GPU
+    msg = action_name + " " + MODULE_NAME + " " + TFB_NO_COMPATIBLE_GPUS;
+    rvs::lp::Log(msg, rvs::logerror);
+
+    if (bjson) {
+      unsigned int sec;
+      unsigned int usec;
+      rvs::lp::get_ticks(&sec, &usec);
+      void *json_root_node = rvs::lp::LogRecordCreate(MODULE_NAME,
+          action_name.c_str(), rvs::loginfo, sec, usec, true);
+      if (!json_root_node) {
+        // log the error
+        string msg = std::string(JSON_CREATE_NODE_ERROR);
+        rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+        return -1;
+      }
+
+      // was HIPTEST_NO_COMPATIBLE_GPUS, which is not defined in this module
+      rvs::lp::AddString(json_root_node, "ERROR", TFB_NO_COMPATIBLE_GPUS);
+      rvs::lp::LogRecordFlush(json_root_node, rvs::loginfo);
+    }
+    return 0;
+  }
+  return hip_num_gpu_devices;
+}
+
+/**
+ * @brief gets all selected GPUs and starts the worker threads
+ * @return run result
+ */
+int tfb_action::run_transferbench(void) {
+  int hip_num_gpu_devices;
+  std::string msg;
+
+  hip_num_gpu_devices = get_num_amd_gpu_devices();
+
+  if (hip_num_gpu_devices > 0) {
+    if (start_tfb_runners())
+      return 0;
+
+    return -1;
+  } else {
+    msg = "No devices match criteria from the test configuration.";
+    rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name);
+    return -1;
+  }
+
+  return 0;
+}
+
+/**
+ * @brief runs the whole transferbench logic
+ * @return run result
+ */
+int tfb_action::run(void) {
+  string msg;
+
+  // get the action name
+  if (property_get(RVS_CONF_NAME_KEY, &action_name)) {
+    rvs::lp::Err("Action name missing", MODULE_NAME_CAPS);
+    return -1;
+  }
+
+  // check for -j flag (json logging)
+  if (property.find("cli.-j") != property.end())
+    bjson = true;
+  if (!get_all_common_config_keys())
+    return -1;
+  if (!get_all_tfb_config_keys())
+    return -1;
+
+  return run_transferbench();
+}
diff --git a/transferbench.so/src/hiptest_worker.cpp b/transferbench.so/src/hiptest_worker.cpp
new file mode 100644
index 00000000..9a31fc32
--- /dev/null
+++ b/transferbench.so/src/hiptest_worker.cpp
@@ -0,0 +1,96 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice
and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/hiptest_worker.h"
+
+#include <unistd.h>     /* fork / execl -- NOTE(review): include targets were lost in transit, restored from usage */
+#include <sys/types.h>
+#include <sys/wait.h>   /* waitpid / WIFEXITED / WEXITSTATUS */
+#include <cerrno>
+#include "include/rvs_blas.h"
+#include "include/rvs_module.h"
+#include "include/rvsloglp.h"
+#include "include/rvs_util.h"
+
+#include <string>
+#include <memory>
+#include <map>
+#include <vector>
+
+#define MODULE_NAME "hiptest"
+
+using std::string;
+
+//bool hipTestWorker::bjson = false;
+
+hipTestWorker::hipTestWorker() {}
+hipTestWorker::~hipTestWorker() {}
+
+
+/**
+ * @brief thread entry point: launches the test binary and logs the result
+ */
+void hipTestWorker::run() {
+  string msg, err_description;
+  int error = 0;
+
+
+  // log the test start message
+  msg = "[" + action_name + "] " + MODULE_NAME + " " +
+        " Starting the Hip test ";
+  rvs::lp::Log(msg, rvs::logtrace);
+
+  // fork/exec the test binary and wait for its result
+  // (the return value was previously stored but never checked)
+  bool hipsuccess = start_hip_tests(error, err_description);
+
+  // the test could not be spawned or did not succeed
+  if (error || !hipsuccess) {
+    string msg = "[" + action_name + "] " + MODULE_NAME + " " +
+                 err_description;
+    rvs::lp::Log(msg, rvs::logerror);
+    return;
+  }
+
+}
+
+/**
+ * @brief forks and execs the test binary, then waits for it to finish
+ * @param error set to errno when the child process could not be spawned
+ * @param errdesc human-readable description of the failure
+ * @return true if the child exited normally with status 0, false otherwise
+ */
+bool hipTestWorker::start_hip_tests(int &error, string &errdesc) {
+  int pid, status;
+  auto found = m_test_path.find_last_of('/');
+  auto fname = m_test_path.substr(found + 1);
+
+  error = 0;
+  if ((pid = fork()) == 0) {  // child
+    // the execl argument list must be terminated by a null pointer,
+    // not by the integer literal 0
+    execl(m_test_path.c_str(), fname.c_str(), m_test_args.c_str(),
+          static_cast<char *>(NULL));
+    _exit(127);  // only reached when execl itself fails
+  } else if (pid < 0) {  // fork failed
+    error = errno;
+    errdesc = "unable to fork the test runner process";
+    return false;
+  } else {  // parent
+    waitpid(pid, &status, 0);
+    // previously any normal exit counted as success; require exit status 0
+    if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+      return true;
+    }
+    errdesc = "test exited abnormally or with a non-zero status";
+    return false;
+  }
+}
diff --git a/transferbench.so/src/rvs_module.cpp b/transferbench.so/src/rvs_module.cpp
new file mode 100644
index 00000000..3fbec70c
--- /dev/null
+++ b/transferbench.so/src/rvs_module.cpp
@@ -0,0 +1,91 @@
+/********************************************************************************
+ *
+ * Copyright (c) 2018 ROCm Developer Tools
+ *
+ * MIT LICENSE:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "include/rvs_module.h"
+#include "include/action.h"
+#include "include/rvsloglp.h"
+#include "include/gpu_util.h"
+
+/**
+ * @defgroup TFB TransferBench Module
+ *
+ * @brief exposes the TransferBench benchmark as an RVS module
+ *
+ * Implements the C entry points RVS uses to load a module: interface
+ * negotiation, module metadata, initialization, and creation/destruction
+ * of tfb_action instances that run the benchmark.
+ *
+ */
+
+extern "C" int rvs_module_has_interface(int iid) {
+  int sts = 0;
+  switch (iid) {
+  case 0:
+  case 1:
+    sts = 1;
+  }
+  return sts;
+}
+
+extern "C" const char* rvs_module_get_description(void) {
+  // was "...HipTest module", which describes a different module
+  return "ROCm Validation Suite TransferBench module";
+}
+
+extern "C" const char* rvs_module_get_config(void) {
+  // advertise the key this module actually reads (see src/action.cpp)
+  return "transfer-type (string)";
+}
+
+extern "C" const char* rvs_module_get_output(void) {
+  return "pass (bool)";
+}
+
+extern "C" int rvs_module_init(void* pMi) {
+  // NOTE(review): cast target was lost in transit -- confirm const void*
+  rvs::lp::Initialize(static_cast<const void*>(pMi));
+  rvs::gpulist::Initialize();
+  return 0;
+}
+
+extern "C" int rvs_module_terminate(void) {
+  return 0;
+}
+
+extern "C" void* rvs_module_action_create(void) {
+  // was `new hiptest_action`; the class declared in include/action.h
+  // is tfb_action
+  return static_cast<void*>(new tfb_action);
+}
+
+extern "C" int rvs_module_action_destroy(void* pAction) {
+  delete static_cast<tfb_action*>(pAction);
+  return 0;
+}
+
+extern "C" int rvs_module_action_property_set(void* pAction, const char* Key,
+                                              const char* Val) {
+  return static_cast<tfb_action*>(pAction)->property_set(Key, Val);
+}
+
+extern "C" int rvs_module_action_run(void* pAction) {
+  return static_cast<tfb_action*>(pAction)->run();
+}
diff --git a/transferbench.so/src/tfb_worker.cpp b/transferbench.so/src/tfb_worker.cpp
new file mode 100644
index 00000000..75b926d9
--- /dev/null
+++ b/transferbench.so/src/tfb_worker.cpp
@@ -0,0 +1,1953 @@
+/*
+Copyright (c) 2019-2023 Advanced Micro Devices,
Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +// This program measures simultaneous copy performance across multiple GPUs +// on the same node +#include +#include +#include +#include +#include + +#include "TransferBench.hpp" +#include "GetClosestNumaNode.hpp" + + +/** + * @brief performs the stress test on the given GPU + */ +void tfbWorker::run() { + string msg, err_description; + int error = 0; + + + // log GST stress test - start message + msg = "[" + action_name + "] " + MODULE_NAME + " " + + " Starting the Hip test "; + rvs::lp::Log(msg, rvs::logtrace); + + // let the GPU ramp-up and check the result + int sts = TfbRun(error, err_description); + + // GPU was not able to do the processing (HIP/rocBlas error(s) occurred) + if (error) { + string msg = "[" + action_name + "] " + MODULE_NAME + " " + + err_description; + rvs::lp::Log(msg, rvs::logerror); + return; + } + +} + + +int TfbRun(int &err, string &errmsg) +{ + // Check for NUMA library support + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); + exit(1); + } + + // Display usage instructions and detected topology + if (argc <= 1) + { + int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0); + if (!outputToCsv) DisplayUsage(argv[0]); + DisplayTopology(outputToCsv); + exit(0); + } + + // Collect environment variables / display current run configuration + EnvVars ev; + + // Determine number of bytes to run per Transfer + size_t numBytesPerTransfer = argc > 2 ? 
atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER; + if (argc > 2) + { + // Adjust bytes if unit specified + char units = argv[2][strlen(argv[2])-1]; + switch (units) + { + case 'K': case 'k': numBytesPerTransfer *= 1024; break; + case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break; + case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break; + } + } + if (numBytesPerTransfer % 4) + { + printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer); + exit(1); + } + + // Check for preset tests + // - Tests that sweep across possible sets of Transfers + if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep")) + { + int numGpuSubExecs = (argc > 3 ? atoi(argv[3]) : 4); + int numCpuSubExecs = (argc > 4 ? atoi(argv[4]) : 4); + + ev.configMode = CFG_SWEEP; + RunSweepPreset(ev, numBytesPerTransfer, numGpuSubExecs, numCpuSubExecs, !strcmp(argv[1], "rsweep")); + exit(0); + } + // - Tests that benchmark peer-to-peer performance + else if (!strcmp(argv[1], "p2p")) + { + ev.configMode = CFG_P2P; + RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float)); + exit(0); + } + + // Check that Transfer configuration file can be opened + ev.configMode = CFG_FILE; + FILE* fp = fopen(argv[1], "r"); + if (!fp) + { + printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]); + exit(1); + } + + // Print environment variables and CSV header + ev.DisplayEnvVars(); + if (ev.outputToCsv) + { + printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n"); + } + + int testNum = 0; + char line[2048]; + while(fgets(line, 2048, fp)) + { + // Check if line is a comment to be echoed to output (starts with ##) + if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line); + + // Parse set of parallel Transfers to execute + std::vector transfers; + ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers); + if (transfers.empty()) continue; + + // If the number of bytes is specified, 
use it + if (numBytesPerTransfer != 0) + { + size_t N = numBytesPerTransfer / sizeof(float); + ExecuteTransfers(ev, ++testNum, N, transfers); + } + else + { + // Otherwise generate a range of values + for (int N = 256; N <= (1<<27); N *= 2) + { + int delta = std::max(1, N / ev.samplingFactor); + int curr = N; + while (curr < N * 2) + { + ExecuteTransfers(ev, ++testNum, curr, transfers); + curr += delta; + } + } + } + } + fclose(fp); + + return 0; +} + +void ExecuteTransfers(EnvVars const& ev, + int const testNum, + size_t const N, + std::vector& transfers, + bool verbose) +{ + int const initOffset = ev.byteOffset / sizeof(float); + + // Map transfers by executor + TransferMap transferMap; + for (Transfer& transfer : transfers) + { + Executor executor(transfer.exeType, transfer.exeIndex); + ExecutorInfo& executorInfo = transferMap[executor]; + executorInfo.transfers.push_back(&transfer); + } + + // Loop over each executor and prepare sub-executors + std::map transferList; + for (auto& exeInfoPair : transferMap) + { + Executor const& executor = exeInfoPair.first; + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = executor.first; + int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType)); + + exeInfo.totalTime = 0.0; + exeInfo.totalSubExecs = 0; + + // Loop over each transfer this executor is involved in + for (Transfer* transfer : exeInfo.transfers) + { + // Determine how many bytes to copy for this Transfer (use custom if pre-specified) + transfer->numBytesActual = (transfer->numBytes ? 
transfer->numBytes : N * sizeof(float)); + + // Allocate source memory + transfer->srcMem.resize(transfer->numSrcs); + for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc) + { + MemType const& srcType = transfer->srcType[iSrc]; + int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType)); + + // Ensure executing GPU can access source memory + if (IsGpuType(exeType) == MEM_GPU && IsGpuType(srcType) && srcIndex != exeIndex) + EnablePeerAccess(exeIndex, srcIndex); + + AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]); + } + + // Allocate destination memory + transfer->dstMem.resize(transfer->numDsts); + for (int iDst = 0; iDst < transfer->numDsts; ++iDst) + { + MemType const& dstType = transfer->dstType[iDst]; + int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType)); + + // Ensure executing GPU can access destination memory + if (IsGpuType(exeType) == MEM_GPU && IsGpuType(dstType) && dstIndex != exeIndex) + EnablePeerAccess(exeIndex, dstIndex); + + AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]); + } + + exeInfo.totalSubExecs += transfer->numSubExecs; + transferList[transfer->transferIndex] = transfer; + } + + // Prepare additional requirement for GPU-based executors + if (IsGpuType(exeType)) + { + HIP_CALL(hipSetDevice(exeIndex)); + + // Single-stream is only supported for GFX-based executors + int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ? 
exeInfo.transfers.size() : 1; + exeInfo.streams.resize(numStreamsToUse); + exeInfo.startEvents.resize(numStreamsToUse); + exeInfo.stopEvents.resize(numStreamsToUse); + for (int i = 0; i < numStreamsToUse; ++i) + { + HIP_CALL(hipStreamCreate(&exeInfo.streams[i])); + HIP_CALL(hipEventCreate(&exeInfo.startEvents[i])); + HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i])); + } + + if (exeType == EXE_GPU_GFX) + { + // Allocate one contiguous chunk of GPU memory for threadblock parameters + // This allows support for executing one transfer per stream, or all transfers in a single stream + AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam), + (void**)&exeInfo.subExecParamGpu); + } + } + } + + if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum); + + // Prepare input memory and block parameters for current N + bool isSrcCorrect = true; + for (auto& exeInfoPair : transferMap) + { + Executor const& executor = exeInfoPair.first; + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = executor.first; + int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType)); + + exeInfo.totalBytes = 0; + + int transferOffset = 0; + for (int i = 0; i < exeInfo.transfers.size(); ++i) + { + // Prepare subarrays each threadblock works on and fill src memory with patterned data + Transfer* transfer = exeInfo.transfers[i]; + transfer->PrepareSubExecParams(ev); + isSrcCorrect &= transfer->PrepareSrc(ev); + exeInfo.totalBytes += transfer->numBytesActual; + + // Copy block parameters to GPU for GPU executors + if (transfer->exeType == EXE_GPU_GFX) + { + exeInfo.transfers[i]->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset; + HIP_CALL(hipSetDevice(exeIndex)); + HIP_CALL(hipMemcpy(&exeInfo.subExecParamGpu[transferOffset], + transfer->subExecParam.data(), + transfer->subExecParam.size() * sizeof(SubExecParam), + hipMemcpyHostToDevice)); + HIP_CALL(hipDeviceSynchronize()); + + transferOffset += transfer->subExecParam.size(); + 
} + } + } + + // Launch kernels (warmup iterations are not counted) + double totalCpuTime = 0; + size_t numTimedIterations = 0; + std::stack threads; + for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++) + { + if (ev.numIterations > 0 && iteration >= ev.numIterations) break; + if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break; + + // Pause before starting first timed iteration in interactive mode + if (verbose && ev.useInteractive && iteration == 0) + { + printf("Memory prepared:\n"); + + for (Transfer& transfer : transfers) + { + printf("Transfer %03d:\n", transfer.transferIndex); + for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc) + printf(" SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]); + for (int iDst = 0; iDst < transfer.numDsts; ++iDst) + printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]); + } + printf("Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Start CPU timing for this iteration + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Execute all Transfers in parallel + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType exeType = exeInfoPair.first.first; + int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 
1 : exeInfo.transfers.size(); + + for (int i = 0; i < numTransfersToRun; ++i) + threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i)); + } + + // Wait for all threads to finish + int const numTransfers = threads.size(); + for (int i = 0; i < numTransfers; i++) + { + threads.top().join(); + threads.pop(); + } + + // Stop CPU timing for this iteration + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); + + if (iteration >= 0) + { + ++numTimedIterations; + totalCpuTime += deltaSec; + } + } + + // Pause for interactive mode + if (verbose && isSrcCorrect && ev.useInteractive) + { + printf("Transfers complete. Hit to continue: "); + if (scanf("%*c") != 0) + { + printf("[ERROR] Unexpected input\n"); + exit(1); + } + printf("\n"); + } + + // Validate that each transfer has transferred correctly + size_t totalBytesTransferred = 0; + int const numTransfers = transferList.size(); + for (auto transferPair : transferList) + { + Transfer* transfer = transferPair.second; + transfer->ValidateDst(ev); + totalBytesTransferred += transfer->numBytesActual; + } + + // Report timings + totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000; + double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime; + double maxGpuTime = 0; + + if (!isSrcCorrect) goto cleanup; + if (ev.useSingleStream) + { + for (auto& exeInfoPair : transferMap) + { + ExecutorInfo exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = exeInfoPair.first.second; + + // Compute total time for non GPU executors + if (exeType != EXE_GPU_GFX) + { + exeInfo.totalTime = 0; + for (auto const& transfer : exeInfo.transfers) + exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime); + } + + double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations); + double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / 
exeDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, exeDurationMsec); + + if (verbose && !ev.outputToCsv) + { + printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes\n", + ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes); + } + + int totalCUs = 0; + for (auto const& transfer : exeInfo.transfers) + { + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + totalCUs += transfer->numSubExecs; + + if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, + transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + MemTypeStr[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + + if (verbose && ev.outputToCsv) + { + printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, + MemTypeStr[exeType], exeIndex, totalCUs, + exeBandwidthGbs, exeDurationMsec); + } + } + } + else + { + for (auto const& transferPair : transferList) + { + Transfer* transfer = transferPair.second; + double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); + double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, transferDurationMsec); 
+ if (!verbose) continue; + if (!ev.outputToCsv) + { + printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", + transfer->transferIndex, + transferBandwidthGbs, transferDurationMsec, + transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->numSubExecs, + transfer->DstToStr().c_str()); + } + else + { + printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n", + testNum, transfer->transferIndex, transfer->numBytesActual, + transfer->SrcToStr().c_str(), + ExeTypeName[transfer->exeType], transfer->exeIndex, + transfer->DstToStr().c_str(), + transfer->numSubExecs, + transferBandwidthGbs, transferDurationMsec, + PtrVectorToStr(transfer->srcMem, initOffset).c_str(), + PtrVectorToStr(transfer->dstMem, initOffset).c_str()); + } + } + } + + // Display aggregate statistics + if (verbose) + { + if (!ev.outputToCsv) + { + printf(" Aggregate (CPU) | %7.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n", + totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime); + } + else + { + printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL\n", + testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime); + } + } + + // Release GPU memory +cleanup: + for (auto exeInfoPair : transferMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + ExeType const exeType = exeInfoPair.first.first; + int const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType)); + + for (auto& transfer : exeInfo.transfers) + { + for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc) + { + MemType const& srcType = transfer->srcType[iSrc]; + DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset); + } + for (int iDst = 0; iDst < transfer->numDsts; ++iDst) + { + MemType const& dstType = transfer->dstType[iDst]; + DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset); + } + 
transfer->subExecParam.clear(); + } + + if (IsGpuType(exeType)) + { + int const numStreams = (int)exeInfo.streams.size(); + for (int i = 0; i < numStreams; ++i) + { + HIP_CALL(hipEventDestroy(exeInfo.startEvents[i])); + HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i])); + HIP_CALL(hipStreamDestroy(exeInfo.streams[i])); + } + + if (exeType == EXE_GPU_GFX) + { + DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu); + } + } + } +} + +void DisplayUsage(char const* cmdName) +{ + printf("TransferBench v%s\n", TB_VERSION); + printf("========================================\n"); + + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); + exit(1); + } + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + int const numCpuDevices = numa_num_configured_nodes(); + + printf("Usage: %s config \n", cmdName); + printf(" config: Either:\n"); + printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n"); + printf(" - Name of preset config:\n"); + printf(" p2p - Peer-to-peer benchmark tests\n"); + printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n"); + printf(" - 3rd/4th optional args for # GPU SubExecs / # CPU SubExecs per Transfer\n"); + printf(" N : (Optional) Number of bytes to copy per Transfer.\n"); + printf(" If not specified, defaults to %lu bytes. 
Must be a multiple of 4 bytes\n", + DEFAULT_BYTES_PER_TRANSFER); + printf(" If 0 is specified, a range of Ns will be benchmarked\n"); + printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n"); + printf("\n"); + + EnvVars::DisplayUsage(); +} + +int RemappedIndex(int const origIdx, bool const isCpuType) +{ + static std::vector remappingCpu; + static std::vector remappingGpu; + + // Build CPU remapping on first use + // Skip numa nodes that are not configured + if (remappingCpu.empty()) + { + for (int node = 0; node <= numa_max_node(); node++) + if (numa_bitmask_isbitset(numa_get_mems_allowed(), node)) + remappingCpu.push_back(node); + } + + // Build remappingGpu on first use + if (remappingGpu.empty()) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + remappingGpu.resize(numGpuDevices); + + int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0; + if (!usePcieIndexing) + { + // For HIP-based indexing no remappingGpu is necessary + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = i; + } + else + { + // Collect PCIe address for each GPU + std::vector> mapping; + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; ++i) + { + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i)); + mapping.push_back(std::make_pair(pciBusId, i)); + } + // Sort GPUs by PCIe address then use that as mapping + std::sort(mapping.begin(), mapping.end()); + for (int i = 0; i < numGpuDevices; ++i) + remappingGpu[i] = mapping[i].second; + } + } + return isCpuType ? 
remappingCpu[origIdx] : remappingGpu[origIdx]; +} + +void DisplayTopology(bool const outputToCsv) +{ + + int numCpuDevices = numa_num_configured_nodes(); + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + + if (outputToCsv) + { + printf("NumCpus,%d\n", numCpuDevices); + printf("NumGpus,%d\n", numGpuDevices); + } + else + { + printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n", + numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices); + } + + // Print out detected CPU topology + if (outputToCsv) + { + printf("NUMA"); + for (int j = 0; j < numCpuDevices; j++) + printf(",NUMA%02d", j); + printf(",# CPUs,ClosestGPUs,ActualNode\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numCpuDevices; j++) + printf("NUMA %02d|", j); + printf(" #Cpus | Closest GPU(s)\n"); + + printf("------------+"); + for (int j = 0; j <= numCpuDevices; j++) + printf("-------+"); + printf("---------------\n"); + } + + for (int i = 0; i < numCpuDevices; i++) + { + int nodeI = RemappedIndex(i, true); + printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? 
"," : "|"); + for (int j = 0; j < numCpuDevices; j++) + { + int nodeJ = RemappedIndex(j, true); + int numaDist = numa_distance(nodeI, nodeJ); + if (outputToCsv) + printf("%d,", numaDist); + else + printf(" %5d |", numaDist); + } + + int numCpus = 0; + for (int j = 0; j < numa_num_configured_cpus(); j++) + if (numa_node_of_cpu(j) == nodeI) numCpus++; + if (outputToCsv) + printf("%d,", numCpus); + else + printf(" %5d | ", numCpus); + +#if !defined(__NVCC__) + bool isFirst = true; + for (int j = 0; j < numGpuDevices; j++) + { + if (GetClosestNumaNode(RemappedIndex(j, false)) == i) + { + if (isFirst) isFirst = false; + else printf(","); + printf("%d", j); + } + } +#endif + printf("\n"); + } + printf("\n"); + +#if defined(__NVCC__) + // No further topology detection done for NVIDIA platforms + return; +#endif + + // Print out detected GPU topology + if (outputToCsv) + { + printf("GPU"); + for (int j = 0; j < numGpuDevices; j++) + printf(",GPU %02d", j); + printf(",PCIe Bus ID,ClosestNUMA\n"); + } + else + { + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + { + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, j)); + std::string fullName = prop.gcnArchName; + std::string archName = fullName.substr(0, fullName.find(':')); + printf(" %6s |", archName.c_str()); + } + printf("\n"); + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + printf(" GPU %02d |", j); + printf(" PCIe Bus ID | #CUs | Closest NUMA\n"); + for (int j = 0; j <= numGpuDevices; j++) + printf("--------+"); + printf("--------------+------+-------------\n"); + } + +#if !defined(__NVCC__) + char pciBusId[20]; + for (int i = 0; i < numGpuDevices; i++) + { + int const deviceIdx = RemappedIndex(i, false); + printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? 
"," : " |"); + for (int j = 0; j < numGpuDevices; j++) + { + if (i == j) + { + if (outputToCsv) + printf("-,"); + else + printf(" - |"); + } + else + { + uint32_t linkType, hopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx, + RemappedIndex(j, false), + &linkType, &hopCount)); + printf("%s%s-%d%s", + outputToCsv ? "" : " ", + linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" : + linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" : + linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" : + linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" : + linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????", + hopCount, outputToCsv ? "," : " |"); + } + } + HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx)); + + int numDeviceCUs = 0; + HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx)); + + if (outputToCsv) + printf("%s,%d,%d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + else + printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx)); + } +#endif +} + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, + std::vector& memTypes, std::vector& memIndices) +{ + char typeChar; + int offset = 0, devIndex, inc; + bool found = false; + + memTypes.clear(); + memIndices.clear(); + while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2) + { + offset += inc; + MemType memType = CharToMemType(typeChar); + + if (IsCpuType(memType) && (devIndex < 0 || devIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, devIndex); + exit(1); + } + if (IsGpuType(memType) && (devIndex < 0 || devIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, devIndex); + exit(1); + } + + found = true; + if (memType != MEM_NULL) + { + memTypes.push_back(memType); + memIndices.push_back(devIndex); + } + } + if (!found) + { + printf("[ERROR] Unable to 
parse memory type token %s. Expected one of %s followed by an index\n", + token.c_str(), MemTypeStr); + exit(1); + } +} + +void ParseExeType(std::string const& token, int const numCpus, int const numGpus, + ExeType &exeType, int& exeIndex) +{ + char typeChar; + if (sscanf(token.c_str(), " %c%d", &typeChar, &exeIndex) != 2) + { + printf("[ERROR] Unable to parse valid executor token (%s). Exepected one of %s followed by an index\n", + token.c_str(), ExeTypeStr); + exit(1); + } + exeType = CharToExeType(typeChar); + + if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= numCpus)) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, exeIndex); + exit(1); + } + if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= numGpus)) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, exeIndex); + exit(1); + } +} + +// Helper function to parse a list of Transfer definitions +void ParseTransfers(char* line, int numCpus, int numGpus, std::vector& transfers) +{ + // Replace any round brackets or '->' with spaces, + for (int i = 1; line[i]; i++) + if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; + + transfers.clear(); + + int numTransfers = 0; + std::istringstream iss(line); + iss >> numTransfers; + if (iss.fail()) return; + + std::string exeMem; + std::string srcMem; + std::string dstMem; + + // If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes) + // otherwise read triples (srcMem, exeMem, dstMem) + bool const advancedMode = (numTransfers < 0); + numTransfers = abs(numTransfers); + + int numSubExecs; + if (!advancedMode) + { + iss >> numSubExecs; + if (numSubExecs <= 0 || iss.fail()) + { + printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numSubExecs); + exit(1); + } + } + + size_t numBytes = 0; + for (int i = 0; i < numTransfers; i++) + { + Transfer transfer; + transfer.transferIndex = i; + transfer.numBytes = 0; + 
transfer.numBytesActual = 0; + if (!advancedMode) + { + iss >> srcMem >> exeMem >> dstMem; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1); + exit(1); + } + } + else + { + std::string numBytesToken; + iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1); + exit(1); + } + if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1) + { + printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1); + exit(1); + } + char units = numBytesToken.back(); + switch (toupper(units)) + { + case 'K': numBytes *= 1024; break; + case 'M': numBytes *= 1024*1024; break; + case 'G': numBytes *= 1024*1024*1024; break; + } + } + + ParseMemType(srcMem, numCpus, numGpus, transfer.srcType, transfer.srcIndex); + ParseMemType(dstMem, numCpus, numGpus, transfer.dstType, transfer.dstIndex); + ParseExeType(exeMem, numCpus, numGpus, transfer.exeType, transfer.exeIndex); + + transfer.numSrcs = (int)transfer.srcType.size(); + transfer.numDsts = (int)transfer.dstType.size(); + if (transfer.numSrcs == 0 && transfer.numDsts == 0) + { + printf("[ERROR] Transfer must have at least one src or dst\n"); + exit(1); + } + + if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1)) + { + printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n"); + exit(1); + } + + transfer.numSubExecs = numSubExecs; + transfer.numBytes = numBytes; + transfers.push_back(transfer); + } +} + +void EnablePeerAccess(int const deviceId, int const peerDeviceId) +{ + int canAccess; + HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId)); + if (!canAccess) + { + printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId); + exit(1); + } + 
HIP_CALL(hipSetDevice(deviceId)); + hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0); + if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) + { + printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n", + deviceId, peerDeviceId, hipGetErrorString(error)); + exit(1); + } +} + +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr) +{ + if (numBytes == 0) + { + printf("[ERROR] Unable to allocate 0 bytes\n"); + exit(1); + } + *memPtr = nullptr; + if (IsCpuType(memType)) + { + // Set numa policy prior to call to hipHostMalloc + numa_set_preferred(devIndex); + + // Allocate host-pinned memory (should respect NUMA mem policy) + if (memType == MEM_CPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n"); + exit(1); +#else + HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); +#endif + } + else if (memType == MEM_CPU) + { +#if defined (__NVCC__) + if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess) +#else + if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess) +#endif + { + printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex); + exit(1); + } + } + else if (memType == MEM_CPU_UNPINNED) + { + *memPtr = numa_alloc_onnode(numBytes, devIndex); + } + + // Check that the allocated pages are actually on the correct NUMA node + memset(*memPtr, 0, numBytes); + CheckPages((char*)*memPtr, numBytes, devIndex); + + // Reset to default numa mem policy + numa_set_preferred(-1); + } + else if (IsGpuType(memType)) + { + if (memType == MEM_GPU) + { + // Allocate GPU memory on appropriate device + HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipMalloc((void**)memPtr, numBytes)); + } + else if (memType == MEM_GPU_FINE) + { +#if defined (__NVCC__) + printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n"); + 
exit(1); +#else + HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained)); + +#endif + } + HIP_CALL(hipMemset(*memPtr, 0, numBytes)); + HIP_CALL(hipDeviceSynchronize()); + } + else + { + printf("[ERROR] Unsupported memory type %d\n", memType); + exit(1); + } +} + +void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes) +{ + if (memType == MEM_CPU || memType == MEM_CPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null CPU pointer for %lu bytes. Skipping hipHostFree\n", bytes); + return; + } + HIP_CALL(hipHostFree(memPtr)); + } + else if (memType == MEM_CPU_UNPINNED) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes. Skipping numa_free\n", bytes); + return; + } + numa_free(memPtr, bytes); + } + else if (memType == MEM_GPU || memType == MEM_GPU_FINE) + { + if (memPtr == nullptr) + { + printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. 
Skipping hipFree\n", bytes); + return; + } + HIP_CALL(hipFree(memPtr)); + } +} + +void CheckPages(char* array, size_t numBytes, int targetId) +{ + unsigned long const pageSize = getpagesize(); + unsigned long const numPages = (numBytes + pageSize - 1) / pageSize; + + std::vector pages(numPages); + std::vector status(numPages); + + pages[0] = array; + for (int i = 1; i < numPages; i++) + { + pages[i] = (char*)pages[i-1] + pageSize; + } + + long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0); + if (retCode) + { + printf("[ERROR] Unable to collect page info\n"); + exit(1); + } + + size_t mistakeCount = 0; + for (int i = 0; i < numPages; i++) + { + if (status[i] < 0) + { + printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i); + exit(1); + } + if (status[i] != targetId) mistakeCount++; + } + if (mistakeCount > 0) + { + printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId); + exit(1); + } +} + +void RunTransfer(EnvVars const& ev, int const iteration, + ExecutorInfo& exeInfo, int const transferIdx) +{ + Transfer* transfer = exeInfo.transfers[transferIdx]; + + if (transfer->exeType == EXE_GPU_GFX) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + // Figure out how many threadblocks to use. + // In single stream mode, all the threadblocks for this GPU are launched + // Otherwise, just launch the threadblocks associated with this single Transfer + int const numBlocksToRun = ev.useSingleStream ? 
exeInfo.totalSubExecs : transfer->numSubExecs; +#if defined(__NVCC__) + HIP_CALL(hipEventRecord(startEvent, stream)); + GpuKernelTable[ev.gpuKernel]<<>>(transfer->subExecParamGpuPtr); + HIP_CALL(hipEventRecord(stopEvent, stream)); +#else + hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel], + dim3(numBlocksToRun, 1, 1), + dim3(BLOCKSIZE, 1, 1), + ev.sharedMemBytes, stream, + startEvent, stopEvent, + 0, transfer->subExecParamGpuPtr); +#endif + // Synchronize per iteration, unless in single sync mode, in which case + // synchronize during last warmup / last actual iteration + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + + if (ev.useSingleStream) + { + // Figure out individual timings for Transfers that were all launched together + for (Transfer* currTransfer : exeInfo.transfers) + { + long long minStartCycle = currTransfer->subExecParamGpuPtr[0].startCycle; + long long maxStopCycle = currTransfer->subExecParamGpuPtr[0].stopCycle; + for (int i = 1; i < currTransfer->numSubExecs; i++) + { + minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle); + maxStopCycle = std::max(maxStopCycle, currTransfer->subExecParamGpuPtr[i].stopCycle); + } + int const wallClockRate = GetWallClockRate(exeIndex); + double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); + currTransfer->transferTime += iterationTimeMs; + } + exeInfo.totalTime += gpuDeltaMsec; + } + else + { + transfer->transferTime += gpuDeltaMsec; + } + } + } + else if (transfer->exeType == EXE_GPU_DMA) + { + // Switch to executing GPU + int const exeIndex = RemappedIndex(transfer->exeIndex, false); + HIP_CALL(hipSetDevice(exeIndex)); + + hipStream_t& stream = exeInfo.streams[transferIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[transferIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx]; + + 
HIP_CALL(hipEventRecord(startEvent, stream)); + if (transfer->numSrcs == 0 && transfer->numDsts == 1) + { + HIP_CALL(hipMemsetAsync(transfer->dstMem[0], + MEMSET_CHAR, transfer->numBytesActual, stream)); + } + else if (transfer->numSrcs == 1 && transfer->numDsts == 1) + { + HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0], + transfer->numBytesActual, hipMemcpyDefault, + stream)); + } + HIP_CALL(hipEventRecord(stopEvent, stream)); + HIP_CALL(hipStreamSynchronize(stream)); + + if (iteration >= 0) + { + // Record GPU timing + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + transfer->transferTime += gpuDeltaMsec; + } + } + else if (transfer->exeType == EXE_CPU) // CPU execution agent + { + // Force this thread and all child threads onto correct NUMA node + int const exeIndex = RemappedIndex(transfer->exeIndex, true); + if (numa_run_on_node(exeIndex)) + { + printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex); + exit(1); + } + + std::vector childThreads; + + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Launch each subExecutor in child-threads to perform memcopies + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i]))); + + // Wait for child-threads to finish + for (int i = 0; i < transfer->numSubExecs; ++i) + childThreads[i].join(); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + + // Record time if not a warmup iteration + if (iteration >= 0) + transfer->transferTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); + } +} + +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) +{ + ev.DisplayP2PBenchmarkEnvVars(); + + // Collect the number of available CPUs/GPUs on this machine + int const numCpus = ev.numCpuDevices; + int const numGpus = ev.numGpuDevices; + int const numDevices = numCpus + numGpus; + + // Enable peer to peer for each GPU + for (int 
i = 0; i < numGpus; i++) + for (int j = 0; j < numGpus; j++) + if (i != j) EnablePeerAccess(i, j); + + // Perform unidirectional / bidirectional + for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) + { + // Print header + if (!ev.outputToCsv) + { + printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni", + ev.useRemoteRead ? "Remote" : "Local", + ev.useRemoteRead ? "Local" : "Remote", + ev.useDmaCopy ? "DMA" : "GFX"); + + if (isBidirectional) + { + printf("%12s", "SRC\\DST"); + } + else + { + if (ev.useRemoteRead) + printf("%12s", "SRC\\EXE+DST"); + else + printf("%12s", "SRC+EXE\\DST"); + } + for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); + for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i); + printf("\n"); + } + + // Loop over all possible src/dst pairs + for (int src = 0; src < numDevices; src++) + { + MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU); + int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus); + + if (!ev.outputToCsv) + printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex); + + for (int dst = 0; dst < numDevices; dst++) + { + MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU); + int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus); + + double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcType, srcIndex, dstType, dstIndex); + if (!ev.outputToCsv) + { + if (bandwidth == 0) + printf("%10s", "N/A"); + else + printf("%10.2f", bandwidth); + } + else + { + printf("%s %02d,%s %02d,%s,%s,%s,%.2f,%lu\n", + srcType == MEM_CPU ? "CPU" : "GPU", srcIndex, + dstType == MEM_CPU ? "CPU" : "GPU", dstIndex, + isBidirectional ? "bidirectional" : "unidirectional", + ev.useRemoteRead ? "Remote" : "Local", + ev.useDmaCopy ? 
"DMA" : "GFX", + bandwidth, + N * sizeof(float)); + } + fflush(stdout); + } + if (!ev.outputToCsv) printf("\n"); + } + if (!ev.outputToCsv) printf("\n"); + } +} + +double GetPeakBandwidth(EnvVars const& ev, size_t const N, + int const isBidirectional, + MemType const srcType, int const srcIndex, + MemType const dstType, int const dstIndex) +{ + // Skip bidirectional on same device + if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f; + + // Prepare Transfers + std::vector transfers(2); + transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float); + + // SRC -> DST + transfers[0].numSrcs = transfers[0].numDsts = 1; + transfers[0].srcType.push_back(srcType); + transfers[0].dstType.push_back(dstType); + transfers[0].srcIndex.push_back(srcIndex); + transfers[0].dstIndex.push_back(dstIndex); + + // DST -> SRC + transfers[1].numSrcs = transfers[1].numDsts = 1; + transfers[1].srcType.push_back(dstType); + transfers[1].dstType.push_back(srcType); + transfers[1].srcIndex.push_back(dstIndex); + transfers[1].dstIndex.push_back(srcIndex); + + // Either perform (local read + remote write), or (remote read + local write) + ExeType gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX; + transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU; + transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU; + transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex); + transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex); + transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs; + transfers[1].numSubExecs = IsGpuType(transfers[0].exeType) ? 
ev.numGpuSubExecs : ev.numCpuSubExecs; + + // Remove (DST->SRC) if not bidirectional + transfers.resize(isBidirectional + 1); + + // Abort if executing on NUMA node with no CPUs + for (int i = 0; i <= isBidirectional; i++) + { + if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0) + return 0; + +#if defined(__NVCC__) + // NVIDIA platform cannot access GPU memory directly from CPU executors + if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType))) + return 0; +#endif + } + + ExecuteTransfers(ev, 0, N, transfers, false); + + // Collect aggregate bandwidth + double totalBandwidth = 0; + for (int i = 0; i <= isBidirectional; i++) + { + double transferDurationMsec = transfers[i].transferTime / (1.0 * ev.numIterations); + double transferBandwidthGbs = (transfers[i].numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; + totalBandwidth += transferBandwidthGbs; + } + return totalBandwidth; +} + +void Transfer::PrepareSubExecParams(EnvVars const& ev) +{ + // Each subExecutor needs to know src/dst pointers and how many elements to transfer + // Figure out the sub-array each subExecutor works on for this Transfer + // - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes, + // except the very last one, for alignment reasons + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + int const targetMultiple = ev.blockBytes / sizeof(float); + + // In some cases, there may not be enough data for all subExectors + int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs); + this->subExecParam.clear(); + this->subExecParam.resize(this->numSubExecs); + + size_t assigned = 0; + for (int i = 0; i < this->numSubExecs; ++i) + { + int const subExecLeft = std::max(0, maxSubExecToUse - i); + size_t const leftover = N - assigned; + size_t const roundedN = (leftover + 
targetMultiple - 1) / targetMultiple; + + SubExecParam& p = this->subExecParam[i]; + p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0; + p.numSrcs = this->numSrcs; + p.numDsts = this->numDsts; + for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) + p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset; + for (int iDst = 0; iDst < this->numDsts; ++iDst) + p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset; + + if (ev.enableDebug) + { + printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n", + this->transferIndex, i, p.N, assigned, assigned + p.N); + } + + p.startCycle = 0; + p.stopCycle = 0; + assigned += p.N; + } + + this->transferTime = 0.0; +} + +void Transfer::PrepareReference(EnvVars const& ev, std::vector& buffer, int bufferIdx) +{ + size_t N = buffer.size(); + if (bufferIdx >= 0) + { + size_t patternLen = ev.fillPattern.size(); + if (patternLen > 0) + { + for (size_t i = 0; i < N; ++i) + buffer[i] = ev.fillPattern[i % patternLen]; + } + else + { + for (size_t i = 0; i < N; ++i) + buffer[i] = PrepSrcValue(bufferIdx, i); + } + } + else // Destination buffer + { + if (this->numSrcs == 0) + { + // Note: 0x75757575 = 13323083.0 + memset(buffer.data(), MEMSET_CHAR, N * sizeof(float)); + } + else + { + PrepareReference(ev, buffer, 0); + + if (this->numSrcs > 1) + { + std::vector temp(N); + for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx) + { + PrepareReference(ev, temp, srcIdx); + for (int i = 0; i < N; ++i) + { + buffer[i] += temp[i]; + } + } + } + } + } +} + +bool Transfer::PrepareSrc(EnvVars const& ev) +{ + if (this->numSrcs == 0) return true; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + std::vector reference(N); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float* srcPtr = this->srcMem[srcIdx] + initOffset; + PrepareReference(ev, reference, srcIdx); + + // Initialize source memory array with reference pattern + if 
(IsGpuType(this->srcType[srcIdx])) + { + int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + if (ev.usePrepSrcKernel) + PrepSrcDataKernel<<<32, BLOCKSIZE>>>(srcPtr, N, srcIdx); + else + HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + } + else if (IsCpuType(this->srcType[srcIdx])) + { + memcpy(srcPtr, reference.data(), this->numBytesActual); + } + + // Perform check just to make sure that data has been copied properly + float* srcCheckPtr = srcPtr; + std::vector srcCopy(N); + if (IsGpuType(this->srcType[srcIdx])) + { + if (!ev.validateDirect) + { + HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + srcCheckPtr = srcCopy.data(); + } + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != srcCheckPtr[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx); +#if !defined(__NVCC__) + float const val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]); +#endif + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + return false; + } + } + } + return true; +} + +void Transfer::ValidateDst(EnvVars const& ev) +{ + if (this->numDsts == 0) return; + size_t const N = this->numBytesActual / sizeof(float); + int const initOffset = ev.byteOffset / sizeof(float); + + 
std::vector reference(N); + PrepareReference(ev, reference, -1); + + std::vector hostBuffer(N); + for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx) + { + float* output; + if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect) + { + output = this->dstMem[dstIdx] + initOffset; + } + else + { + int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false); + HIP_CALL(hipSetDevice(deviceIdx)); + HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault)); + HIP_CALL(hipDeviceSynchronize()); + output = hostBuffer.data(); + } + + for (size_t i = 0; i < N; ++i) + { + if (reference[i] != output[i]) + { + printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx); + for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx) + { + float srcVal; + HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault)); +#if !defined(__NVCC__) + float val = this->srcMem[srcIdx][initOffset + i]; + printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val); +#else + printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal); +#endif + } + printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]); +#if !defined(__NVCC__) + float dstVal = this->dstMem[dstIdx][initOffset + i]; + printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n", + dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal); +#else + printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]); +#endif + printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n", + this->transferIndex, + this->SrcToStr().c_str(), + ExeTypeStr[this->exeType], this->exeIndex, + this->numSubExecs, + this->DstToStr().c_str()); + if (!ev.continueOnError) + exit(1); + else + 
break; + } + } + } +} + +std::string Transfer::SrcToStr() const +{ + if (numSrcs == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numSrcs; ++i) + ss << MemTypeStr[srcType[i]] << srcIndex[i]; + return ss.str(); +} + +std::string Transfer::DstToStr() const +{ + if (numDsts == 0) return "N"; + std::stringstream ss; + for (int i = 0; i < numDsts; ++i) + ss << MemTypeStr[dstType[i]] << dstIndex[i]; + return ss.str(); +} + +// NOTE: This is a stop-gap solution until HIP provides wallclock values +int GetWallClockRate(int deviceId) +{ + static std::vector wallClockPerDeviceMhz; + + if (wallClockPerDeviceMhz.size() == 0) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + wallClockPerDeviceMhz.resize(numGpuDevices); + + for (int i = 0; i < numGpuDevices; i++) + { +#if defined(__NVCC__) + int value = 1410000; + //HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i)); + //value *= 1000; +#else + hipDeviceProp_t prop; + HIP_CALL(hipGetDeviceProperties(&prop, i)); + int value = 25000; + switch (prop.gcnArch) + { + case 906: case 910: value = 25000; break; + default: + printf("Unrecognized GCN arch %d\n", prop.gcnArch); + } +#endif + wallClockPerDeviceMhz[i] = value; + } + } + return wallClockPerDeviceMhz[deviceId]; +} + +void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom) +{ + ev.DisplaySweepEnvVars(); + + // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets) + std::vector> exeList; + for (auto exe : ev.sweepExe) + { + ExeType const exeType = CharToExeType(exe); + if (IsGpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex) + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + else if (IsCpuType(exeType)) + { + for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex) + { + // Skip NUMA nodes that have no CPUs (e.g. 
CXL) + if (ev.numCpusPerNuma[exeIndex] == 0) continue; + exeList.push_back(std::make_pair(exeType, exeIndex)); + } + } + } + int numExes = exeList.size(); + + std::vector> srcList; + for (auto src : ev.sweepSrc) + { + MemType const srcType = CharToMemType(src); + int const numDevices = IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex) + srcList.push_back(std::make_pair(srcType, srcIndex)); + } + int numSrcs = srcList.size(); + + + std::vector> dstList; + for (auto dst : ev.sweepDst) + { + MemType const dstType = CharToMemType(dst); + int const numDevices = IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices; + + for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex) + dstList.push_back(std::make_pair(dstType, dstIndex)); + } + int numDsts = dstList.size(); + + // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count) + struct TransferInfo + { + MemType srcType; int srcIndex; + ExeType exeType; int exeIndex; + MemType dstType; int dstIndex; + }; + + // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI + bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0); + + std::vector possibleTransfers; + TransferInfo tinfo; + for (int i = 0; i < numExes; ++i) + { + // Skip CPU executors if XGMI link must be used + if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue; + tinfo.exeType = exeList[i].first; + tinfo.exeIndex = exeList[i].second; + + bool isXgmiSrc = false; + int numHopsSrc = 0; + for (int j = 0; j < numSrcs; ++j) + { + if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first)) + { + if (exeList[i].second != srcList[j].second) + { +#if defined(__NVCC__) + isXgmiSrc = false; +#else + uint32_t exeToSrcLinkType, exeToSrcHopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false), + RemappedIndex(srcList[j].second, false), + &exeToSrcLinkType, + 
&exeToSrcHopCount)); + isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI); + if (isXgmiSrc) numHopsSrc = exeToSrcHopCount; +#endif + } + else + { + isXgmiSrc = true; + numHopsSrc = 0; + } + + // Skip this SRC if it is not XGMI but only XGMI links may be used + if (useXgmiOnly && !isXgmiSrc) continue; + + // Skip this SRC if XGMI distance is already past limit + if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue; + } + else if (useXgmiOnly) continue; + + tinfo.srcType = srcList[j].first; + tinfo.srcIndex = srcList[j].second; + + bool isXgmiDst = false; + int numHopsDst = 0; + for (int k = 0; k < numDsts; ++k) + { + if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first)) + { + if (exeList[i].second != dstList[k].second) + { +#if defined(__NVCC__) + isXgmiSrc = false; +#else + uint32_t exeToDstLinkType, exeToDstHopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false), + RemappedIndex(dstList[k].second, false), + &exeToDstLinkType, + &exeToDstHopCount)); + isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI); + if (isXgmiDst) numHopsDst = exeToDstHopCount; +#endif + } + else + { + isXgmiDst = true; + numHopsDst = 0; + } + } + + // Skip this DST if it is not XGMI but only XGMI links may be used + if (useXgmiOnly && !isXgmiDst) continue; + + // Skip this DST if total XGMI distance (SRC + DST) is less than min limit + if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue; + + // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit + if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue; + +#if defined(__NVCC__) + // Skip CPU executors on GPU memory on NVIDIA platform + if (IsCpuType(exeList[i].first) && (IsGpuType(dstList[j].first) || IsGpuType(dstList[k].first))) + continue; +#endif + + tinfo.dstType = dstList[k].first; + tinfo.dstIndex = dstList[k].second; + + possibleTransfers.push_back(tinfo); + } + } + 
} + + int const numPossible = (int)possibleTransfers.size(); + int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax); + + if (ev.sweepMin > numPossible) + { + printf("No valid test configurations exist\n"); + return; + } + + if (ev.outputToCsv) + { + printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms)," + "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n"); + } + + int numTestsRun = 0; + int M = ev.sweepMin; + std::uniform_int_distribution randSize(1, numBytesPerTransfer / sizeof(float)); + std::uniform_int_distribution distribution(ev.sweepMin, maxParallelTransfers); + + // Log sweep to configuration file + FILE *fp = fopen("lastSweep.cfg", "w"); + if (!fp) + { + printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n"); + exit(1); + } + + // Create bitmask of numPossible triplets, of which M will be chosen + std::string bitmask(M, 1); bitmask.resize(numPossible, 0); + auto cpuStart = std::chrono::high_resolution_clock::now(); + while (1) + { + if (isRandom) + { + // Pick random number of simultaneous transfers to execute + // NOTE: This currently skews distribution due to some #s having more possibilities than others + M = distribution(*ev.generator); + + // Generate a random bitmask + for (int i = 0; i < numPossible; i++) + bitmask[i] = (i < M) ? 
1 : 0; + std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator); + } + + // Convert bitmask to list of Transfers + std::vector transfers; + for (int value = 0; value < numPossible; ++value) + { + if (bitmask[value]) + { + // Convert integer value to (SRC->EXE->DST) triplet + Transfer transfer; + transfer.numSrcs = 1; + transfer.numDsts = 1; + transfer.srcType = {possibleTransfers[value].srcType}; + transfer.srcIndex = {possibleTransfers[value].srcIndex}; + transfer.exeType = possibleTransfers[value].exeType; + transfer.exeIndex = possibleTransfers[value].exeIndex; + transfer.dstType = {possibleTransfers[value].dstType}; + transfer.dstIndex = {possibleTransfers[value].dstIndex}; + transfer.numSubExecs = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs; + transfer.transferIndex = transfers.size(); + transfer.numBytes = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0; + transfers.push_back(transfer); + } + } + + LogTransfers(fp, ++numTestsRun, transfers); + ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers); + + // Check for test limit + if (numTestsRun == ev.sweepTestLimit) + { + printf("Test limit reached\n"); + break; + } + + // Check for time limit + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + double totalCpuTime = std::chrono::duration_cast>(cpuDelta).count(); + if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit) + { + printf("Time limit exceeded\n"); + break; + } + + // Increment bitmask if not random sweep + if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end())) + { + M++; + // Check for completion + if (M > maxParallelTransfers) + { + printf("Sweep complete\n"); + break; + } + for (int i = 0; i < numPossible; i++) + bitmask[i] = (i < M) ? 
1 : 0; + } + } + fclose(fp); +} + +void LogTransfers(FILE *fp, int const testNum, std::vector const& transfers) +{ + fprintf(fp, "# Test %d\n", testNum); + fprintf(fp, "%d", -1 * (int)transfers.size()); + for (auto const& transfer : transfers) + { + fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)", + MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0], + ExeTypeStr[transfer.exeType], transfer.exeIndex, + MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0], + transfer.numSubExecs, + transfer.numBytes); + } + fprintf(fp, "\n"); + fflush(fp); +} + +std::string PtrVectorToStr(std::vector const& strVector, int const initOffset) +{ + std::stringstream ss; + for (int i = 0; i < strVector.size(); ++i) + { + if (i) ss << " "; + ss << (strVector[i] + initOffset); + } + return ss.str(); +} diff --git a/transferbench.so/tests.cmake b/transferbench.so/tests.cmake new file mode 100644 index 00000000..c1b6b74e --- /dev/null +++ b/transferbench.so/tests.cmake @@ -0,0 +1,27 @@ +################################################################################ +## +## Copyright (c) 2018 ROCm Developer Tools +## +## MIT LICENSE: +## Permission is hereby granted, free of charge, to any person obtaining a copy of +## this software and associated documentation files (the "Software"), to deal in +## the Software without restriction, including without limitation the rights to +## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +## of the Software, and to permit persons to whom the Software is furnished to do +## so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. +## +################################################################################ + + +include(tests_conf_logging) diff --git a/transferbench.so/transferbench/CHANGELOG.md b/transferbench.so/transferbench/CHANGELOG.md new file mode 100644 index 00000000..76932de4 --- /dev/null +++ b/transferbench.so/transferbench/CHANGELOG.md @@ -0,0 +1,199 @@ +# Changelog for TransferBench + +## v1.22 +### Modified +- Switching kernel timing function to wall_clock64 + +## v1.21 +### Fixed +- Fixed bug with SAMPLING_FACTOR + +## v1.20 +### Fixed +- VALIDATE_DIRECT can now be used with USE_PREP_KERNEL +- Switch to local GPU for validating GPU memory + +## v1.19 +### Added +- VALIDATE_DIRECT now also applies to source memory array checking +- Adding null memory pointer check prior to deallocation + +## v1.18 +### Added +- Adding ability to validate GPU destination memory directly without going through CPU staging buffer (VALIDATE_DIRECT) + - NOTE: This will only work on AMD devices with large-bar access enable and may slow things down considerably +### Changed +- Refactored how environment variables are displayed +- Mismatch stops after first detected error within an array instead of list all mismatched elements + +## v1.17 +### Added +- Allow switch to GFX kernel for source array initialization (USE_PREP_KERNEL) + - USE_PREP_KERNEL cannot be used with FILL_PATTERN +- Adding ability to compile with nvcc only (TransferBenchCuda) +### Changed +- Default pattern set to [Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)] +### Fixed +- Re-adding example.cfg file + +## v1.16 +### Added +- Additional src array validation during preparation +- Adding new env var CONTINUE_ON_ERROR to resume tests after 
mis-match detection +- Initializing GPU memory to 0 during allocation + +## v1.15 +### Fixed +- Fixed a bug that prevented single Transfers > 8GB +### Changed +- Removed "check for latest ROCm" warning when allocating too much memory +- Printing off source memory value as well when mis-match is detected + +## v1.14 +### Added +- Added documentation +- Added pthread linking in src/Makefile and CMakeLists.txt +- Added printing off the hex value of the floats for output and reference + +## v1.13 +### Added +- Added support for cmake + +### Changed +- Converted to the Pitchfork layout standard + +## v1.12 +### Added +- Added support for TransferBench on NVIDIA platforms (via HIP_PLATFORM=nvidia) + - CPU executors on NVIDIA platform cannot access GPU memory (no large-bar access) + +## v1.11 +### Added +- New multi-input / multi-output support (MIMO). Transfers now can reduce (element-wise summation) multiple input memory arrays + and write the sums to multiple outputs +- New GPU-DMA executor 'D' (uses hipMemcpy for SDMA copies). Previously this was done using USE_HIP_CALL, but now this allows + GPU-GFX kernel to run in parallel with GPU-DMA instead of applying to all GPU executors globally. + - GPU-DMA executor can only be used for single-input/single-output Transfers + - GPU-DMA executor can only be associated with one SubExecutor +- Added new "Null" memory type 'N', which represents empty memory. This allows for read-only or write-only Transfers +- Added new GPU_KERNEL environment variable that allows for switching between various GPU-GFX reduction kernels + +### Optimized +- Slightly improved GPU-GFX kernel performance based on hardware architecture when running with fewer CUs + +### Changed +- Updated the example.cfg file to cover the new features +- Updated output to support MIMO +- Changed CUs/CPUs threads naming to SubExecutors for consistency +- Sweep Preset: + - Default sweep preset executors now includes DMA +- P2P Benchmarks: + - Now only works via "p2p". 
Removed "p2p_rr", "g2g" and "g2g_rr". + - Setting NUM_CPU_DEVICES=0 can be used to only benchmark GPU devices (like "g2g") + - New environment variable USE_REMOTE_READ replaces "_rr" presets + - New environment variable USE_GPU_DMA=1 replaces USE_HIP_CALL=1 for benchmarking with GPU-DMA Executor + - Number of GPU SubExecutors for benchmark can be specified via NUM_GPU_SE + - Defaults to all CUs for GPU-GFX, 1 for GPU-DMA + - Number of CPU SubExecutors for benchmark can be specified via NUM_CPU_SE +- Psuedo-random input pattern has been slightly adjusted to have different patterns for each input array within same Transfer + +### Removed +- USE_HIP_CALL has been removed. Use GPU-DMA executor 'D' or set USE_GPU_DMA=1 for P2P benchmark presets + - Currently warning will be issued if USE_HIP_CALL is set to 1 and program will terminate +- Removed NUM_CPU_PER_TRANSFER - The number of CPU SubExecutors will be whatever is specified for the Transfer +- Removed USE_MEMSET environment variable. This can now be done via a Transfer using the null memory type + +## v1.10 +### Fixed +- Fix incorrect bandwidth calculation when using single stream mode and per-Transfer data sizes + +## v1.09 +### Added +- Printing off src/dst memory addresses during interactive mode +### Changed +- Switching to numa_set_preferred instead of set_mempolicy + +## v1.08 +### Changed +- Fixing handling of non-configured NUMA nodes +- Topology detection now shows actual NUMA node indices +- Fix for issue with NUM_GPU_DEVICES + +## v1.07 +### Changed +- Fix bug with allocations involving non-default CPU memory types + +## v1.06 +### Added +- Added unpinned CPU memory type ('U'). 
May require HSA_XNACK=1 in order to access via GPU executors +- Adding logging of sweep configuration to lastSweep.cfg +- Adding ability to specify number of CUs to use for sweep-based presets +### Changed +- Fixing random sweep repeatibility +- Fixing bug with CPU NUMA node memory allocation +- Modified advanced configuration file format to accept bytes per Transfer + +## v1.05 +### Added +- Topology output now includes NUMA node information +- Support for NUMA nodes with no CPU cores (e.g. CXL memory) +### Removed +- SWEEP_SRC_IS_EXE environment variable + +## v1.04 +### Added +- New environment variables for sweep based presets + - SWEEP_XGMI_MIN - Min number of XGMI hops for Transfers + - SWEEP_XGMI_MAX - Max number of XGMI hops for Transfers + - SWEEP_SEED - Random seed being used + - SWEEP_RAND_BYTES - Use random amount of bytes (up to pre-specified N) for each Transfer +### Changed + - CSV output for sweep includes env vars section followed by output + - CSV output no longer lists env var parameters in columns + - Default number of warmup iterations changed from 3 to 1 + - Splitting CSV output of link type to ExeToSrcLinkType and ExeToDstLinkType + +## v1.03 +### Added +- New preset modes stress-test benchmarks "sweep" and "randomsweep" + - sweep iterates over all possible sets of Transfers to test + - randomsweep iterates over random sets of Transfers + - New sweep-only environment variables can modify sweep + - SWEEP_SRC - String containing only "B","C","F", or "G", defining possible source memory types + - SWEEP_EXE - String containing only "C", or "G", defining possible executors + - SWEEP_DST - String containing only "B","C","F", or "G", defining possible destination memory types + - SWEEP_SRC_IS_EXE - Restrict executor to be the same as the source if non-zero + - SWEEP_MIN - Minimum number of parallel transfers to test + - SWEEP_MAX - Maximum number of parallel transfers to test + - SWEEP_COUNT - Maximum number of tests to run + - SWEEP_TIME_LIMIT - 
Maximum number of seconds to run tests for +- New environment variable to restrict number of available GPUs to test on (primarily for sweep runs) + - NUM_CPU_DEVICES - Number of CPU devices + - NUM_GPU_DEVICES - Number of GPU devices +### Changed +- Fixed timing display for CPU-executors when using single stream mode + +## v1.02 +### Added +- Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test +### Changed +- Copies are now refered to as Transfers instead of Links +- Re-ordering how env vars are displayed (alphabetically now) +### Removed +- Combined timing is now always on for kernel-based GPU copies. COMBINED_TIMING env var has been removed +- Use single sync is no longer supported to facility variable iterations. USE_SINGLE_SYNC env var has been removed + +## v1.01 +### Added +- Adding USE_SINGLE_STREAM feature + - All Links that execute on the same GPU device are executed with a single kernel launch on a single stream + - Does not work with USE_HIP_CALL and forces USE_SINGLE_SYNC to collect timings + - Adding ability to request coherent / fine-grained host memory ('B') +### Changed +- Separating TransferBench from RCCL repo +- Peer-to-peer benchmark mode now works OUTPUT_TO_CSV +- Toplogy display now works with OUTPUT_TO_CSV +- Moving documentation about config file into example.cfg +### Removed +- Removed config file generation +- Removed show pointer address environment variable (SHOW_ADDR) diff --git a/transferbench.so/transferbench/CMakeLists.txt b/transferbench.so/transferbench/CMakeLists.txt new file mode 100644 index 00000000..18df727b --- /dev/null +++ b/transferbench.so/transferbench/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+if (DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory") +else() + set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory") +endif() +cmake_minimum_required(VERSION 3.5) + +project(TransferBench VERSION 1.0 LANGUAGES CXX) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib") +include_directories(${ROCM_PATH}/include) +link_libraries(numa hsa-runtime64 pthread) +set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ..) +add_executable(TransferBench src/TransferBench.cpp) +target_include_directories(TransferBench PRIVATE src/include) + +find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH}) +include(ROCMInstallTargets) +include(ROCMCreatePackage) +set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF) + +set(PACKAGE_NAME TB) +set(LIBRARY_NAME TransferBench) + +rocm_install(TARGETS TransferBench) + +rocm_create_package( + NAME ${LIBRARY_NAME} + DESCRIPTION "TransferBench package" + MAINTAINER "RCCL Team " +) diff --git a/transferbench.so/transferbench/LICENSE.md b/transferbench.so/transferbench/LICENSE.md new file mode 100644 index 00000000..cf8533aa --- /dev/null +++ b/transferbench.so/transferbench/LICENSE.md @@ -0,0 +1,19 @@ +Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/transferbench.so/transferbench/README.md b/transferbench.so/transferbench/README.md new file mode 100644 index 00000000..b1155f78 --- /dev/null +++ b/transferbench.so/transferbench/README.md @@ -0,0 +1,63 @@ +# TransferBench + +TransferBench is a simple utility capable of benchmarking simultaneous copies between user-specified devices (CPUs/GPUs). + +## Requirements + +1. ROCm stack installed on the system (HIP runtime) +2. libnuma installed on system + +## Documentation + +Run the steps below to build documentation locally. + +``` +cd docs + +pip3 install -r .sphinx/requirements.txt + +python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html +``` + +## Building + To build TransferBench using Makefile: + ```shell + $ make + ``` + + To build TransferBench using cmake: + ```shell +$ mkdir build +$ cd build +$ CXX=/opt/rocm/bin/hipcc cmake .. +$ make + ``` + + If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately + +## NVIDIA platform support + +TransferBench may also be built to run on NVIDIA platforms either via HIP, or native nvcc + +To build with HIP for NVIDIA (requires HIP-compatible CUDA version installed e.g. 
CUDA 11.5): +``` + CUDA_PATH= HIP_PLATFORM=nvidia make` +``` + +To build with native nvcc: (Builds TransferBenchCuda) +``` + make +``` + +## Hints and suggestions +- Running TransferBench with no arguments will display usage instructions and detected topology information +- There are several preset configurations that can be used instead of a configuration file + including: + - p2p - Peer to peer benchmark test + - sweep - Sweep across possible sets of Transfers + - rsweep - Random sweep across possible sets of Transfers +- When using the same GPU executor in multiple simultaneous Transfers, performance may be + serialized due to the maximum number of hardware queues available. + - The number of maximum hardware queues can be adjusted via GPU_MAX_HW_QUEUES + - Alternatively, running in single stream mode (USE_SINGLE_STREAM=1) may avoid this issue + by launching all Transfers on a single stream instead of individual streams diff --git a/transferbench.so/transferbench/docs/.doxygen/Doxyfile b/transferbench.so/transferbench/docs/.doxygen/Doxyfile new file mode 100644 index 00000000..fbebc2a9 --- /dev/null +++ b/transferbench.so/transferbench/docs/.doxygen/Doxyfile @@ -0,0 +1,2455 @@ +# Doxyfile 1.8.10 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "hipblas" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = v4.0.5.0 + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. 
If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docBin + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. 
+ +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. 
+ +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". 
You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. 
In the latter case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = YES + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. 
+ +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = YES + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. 
The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. 
If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. 
If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. 
The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. 
+ +INPUT = ../../src/ \ + ../../src/include + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.as \ + *.js + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. 
This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. 
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = ../README.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. 
+ +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. 
+# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. 
The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. 
For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. 
+# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 1
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/