From feb34b01b32e42e9b148501d09254c681857f3ac Mon Sep 17 00:00:00 2001 From: Mohammed Junaid Date: Tue, 27 Aug 2024 14:02:11 -0500 Subject: [PATCH 1/5] strided batched matrix init. (using hiprand) --- CMakeLists.txt | 3 + gm.so/tests.cmake | 3 +- gpup.so/tests.cmake | 3 +- include/rvs_blas.h | 4 ++ pesm.so/tests.cmake | 5 +- rvs/CMakeLists.txt | 5 +- rvs/tests.cmake | 7 +- smqt.so/tests.cmake | 5 +- src/rvs_blas.cpp | 165 +++++++++++++++++++++++++++++--------------- 9 files changed, 134 insertions(+), 66 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bce31b9..7867e3af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -443,6 +443,9 @@ add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/rvs_smi-build/librocm_smi64.so endif() # if (RVS_ROCMSMI EQUAL 1) +set(HIPRAND_INC_DIR "${ROCM_PATH}/include") +set(HIPRAND_LIB_DIR "${ROCM_PATH}/lib") + if (RVS_ROCBLAS EQUAL 1) set(ROCBLAS_INC_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install") set(ROCBLAS_LIB_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install/lib/") diff --git a/gm.so/tests.cmake b/gm.so/tests.cmake index 0a4fac61..b360065c 100644 --- a/gm.so/tests.cmake +++ b/gm.so/tests.cmake @@ -24,12 +24,13 @@ ################################################################################ set(ROCBLAS_LIB "rocblas") +set(HIPRAND_LIB "hiprand") set(ROC_THUNK_NAME "hsakmt") set(CORE_RUNTIME_NAME "hsa-runtime") set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") set(UT_LINK_LIBS libpthread.so libpci.so libm.so libdl.so "lib${ROCM_SMI_LIB}.so" - ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES} + ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES} ${HIPRAND_LIB} ) # Add directories to look for library files to link diff --git a/gpup.so/tests.cmake b/gpup.so/tests.cmake index fabf2c5c..9a1f7ed5 100644 --- a/gpup.so/tests.cmake +++ b/gpup.so/tests.cmake @@ -24,12 +24,13 @@ ################################################################################ set(ROCBLAS_LIB "rocblas") +set(HIPRAND_LIB "hiprand") set(ROC_THUNK_NAME "hsakmt") set(CORE_RUNTIME_NAME "hsa-runtime") set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") set(UT_LINK_LIBS libpthread.so libm.so libdl.so ${ROCM_SMI_LIB} - ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES}) + ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES} ${HIPRAND_LIB}) # Add directories to look for library files to link link_directories(${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR}) diff --git a/include/rvs_blas.h b/include/rvs_blas.h index 2cf898e9..4830d37d 100644 --- a/include/rvs_blas.h +++ b/include/rvs_blas.h @@ -40,6 +40,7 @@ #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" #include +#include typedef void (*rvsBlasCallback_t) (bool status, void *userData); @@ -215,6 +216,9 @@ class rvs_blas { //! HIP API stream - used to query for GEMM completion hipStream_t hip_stream; + //! random number generator + hiprandGenerator_t hiprand_generator; + //! rocBlas related handle rocblas_handle blas_handle; //! 
TRUE is rocBlas handle was successfully initialized diff --git a/pesm.so/tests.cmake b/pesm.so/tests.cmake index 7b620cb3..2c726580 100644 --- a/pesm.so/tests.cmake +++ b/pesm.so/tests.cmake @@ -24,16 +24,17 @@ ################################################################################ set(ROCBLAS_LIB "rocblas") +set(HIPRAND_LIB "hiprand") set(ROC_THUNK_NAME "hsakmt") set(CORE_RUNTIME_NAME "hsa-runtime") set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") set(UT_LINK_LIBS libpthread.so libpci.so libm.so libdl.so "lib${ROCM_SMI_LIB}.so" - ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES} + ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES} ${HIPRAND_LIB} ) # Add directories to look for library files to link -link_directories(${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR}) +link_directories(${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${HIPRAND_LIB_DIR}) set (UT_SOURCES test/unitactionbase.cpp ) diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt index 3909be80..fc0118e8 100644 --- a/rvs/CMakeLists.txt +++ b/rvs/CMakeLists.txt @@ -115,19 +115,20 @@ endif() ## define include directories include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS}) ## define lib directories -link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH}) +link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH} ${HIPRAND_LIB_PATH}) ## additional libraries set(ROCBLAS_LIB "rocblas") set(ROC_THUNK_NAME "hsakmt") set(CORE_RUNTIME_NAME "hsa-runtime") +set(HIPRAND_LIB "hiprand") set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") set(PROJECT_LINK_LIBS libdl.so libpthread.so libpci.so ${YAML_CPP_LIBRARIES}) ## define target add_executable(${RVS_TARGET} src/rvs.cpp) target_link_libraries(${RVS_TARGET} rvslib - ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${PROJECT_LINK_LIBS}) + ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${PROJECT_LINK_LIBS} ${HIPRAND_LIB}) add_dependencies(${RVS_TARGET} rvslib) install(TARGETS ${RVS_TARGET} diff --git a/rvs/tests.cmake b/rvs/tests.cmake index e9256368..c5194822 100644 --- a/rvs/tests.cmake +++ b/rvs/tests.cmake @@ -31,17 +31,18 @@ # add_dependencies(rvstest rvshelper) set(ROCBLAS_LIB "rocblas") +set(HIPRAND_LIB "hiprand") set(ROC_THUNK_NAME "hsakmt") set(CORE_RUNTIME_NAME "hsa-runtime") set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") ## define lib directories -link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR}) +link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR} ${HIPRAND_LIB_DIR}) ## define target for "test-to-fail" add_executable(${RVS_TARGET}fail src/rvs.cpp) target_link_libraries(${RVS_TARGET}fail rvslib rvslibut ${PROJECT_LINK_LIBS} - ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${ROCM_CORE} ${CORE_RUNTIME_TARGET}) + ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${ROCM_CORE} ${CORE_RUNTIME_TARGET} ${HIPRAND_LIB}) target_compile_definitions(${RVS_TARGET}fail PRIVATE RVS_INVERT_RETURN_STATUS) set_target_properties(${RVS_TARGET}fail PROPERTIES @@ -210,7 +211,7 @@ FOREACH(SINGLE_TEST ${TESTSOURCES}) ${PROJECT_LINK_LIBS} ${PROJECT_TEST_LINK_LIBS} rvslib rvslibut gtest_main gtest pthread - ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} + ${ROCM_SMI_LIB} 
${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${HIPRAND_LIB} ) add_dependencies(${TEST_NAME} rvs_gtest_target) diff --git a/smqt.so/tests.cmake b/smqt.so/tests.cmake index e9cad06c..76766ded 100644 --- a/smqt.so/tests.cmake +++ b/smqt.so/tests.cmake @@ -24,16 +24,17 @@ ################################################################################ set(ROCBLAS_LIB "rocblas") +set(HIPRAND_LIB "hiprand") set(ROC_THUNK_NAME "hsakmt") set(CORE_RUNTIME_NAME "hsa-runtime") set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") set(UT_LINK_LIBS libpthread.so libpci.so libm.so libdl.so "lib${ROCM_SMI_LIB}.so" - ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES} + ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${ROCM_CORE} ${YAML_CPP_LIBRARIES} ${HIPRAND_LIB} ) # Add directories to look for library files to link -link_directories(${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR}) +link_directories(${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${HIPRAND_LIB_DIR}) set (UT_SOURCES src/action.cpp test/unitsmqt.cpp ) diff --git a/src/rvs_blas.cpp b/src/rvs_blas.cpp index 3a82a1a3..437f9f29 100644 --- a/src/rvs_blas.cpp +++ b/src/rvs_blas.cpp @@ -1,6 +1,6 @@ /******************************************************************************** * - * Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. * * MIT LICENSE: * Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -210,6 +210,20 @@ bool rvs_blas::init_gpu_device(void) { return false; } + if("hiprand" == matrix_init) { + + // Create hipRAND generator, assign stream. + if(hiprandCreateGenerator(&hiprand_generator, HIPRAND_RNG_PSEUDO_DEFAULT) != HIPRAND_STATUS_SUCCESS) { + std::cout << "\n hiprandCreateGenerator() failed !!!" << "\n"; + return false; + } + + if(hiprandSetStream(hiprand_generator, hip_stream) != HIPRAND_STATUS_SUCCESS) { + std::cout << "\n hiprandSetStream() failed !!!" << "\n"; + return false; + } + } + is_handle_init = true; return true; } @@ -220,6 +234,12 @@ bool rvs_blas::init_gpu_device(void) { */ bool rvs_blas::copy_data_to_gpu(std::string ops_type) { + if("hiprand" == matrix_init) { + + // hipRAND no need for allocation in host memory, so no host to device copy ! + return true; + } + if(ops_type == "sgemm") { if (da) { @@ -522,6 +542,12 @@ void rvs_blas::release_gpu_matrix_mem(void) { */ bool rvs_blas::alocate_host_matrix_mem(void) { + if("hiprand" == matrix_init) { + + // hipRAND no need for allocation in host memory + return true; + } + try { if(ops_type == "sgemm") { @@ -798,86 +824,115 @@ bool rvs_blas::run_blass_gemm(std::string ops_type) { */ void rvs_blas::generate_random_matrix_data(void) { - size_t i; if (!is_error) { - uint64_t nextr = (uint64_t) time(NULL); - if(ops_type == "sgemm") { + if("hiprand" == matrix_init) { - //SGEMM stuff - for (i = 0; i < size_a; ++i) - ha[i] = fast_pseudo_rand(&nextr, i); + if(ops_type == "dgemm") { - for (i = 0; i < size_b; ++i) - hb[i] = fast_pseudo_rand(&nextr, i); + if(hiprandGenerateUniformDouble(hiprand_generator, ddbla, size_a) != HIPRAND_STATUS_SUCCESS) { + std::cout << "\n hiprandGenerateUniformDouble() failed !!!" << "\n"; + return; + } - for (int i = 0; i < size_c; ++i) - hc[i] = fast_pseudo_rand(&nextr, i); + if(hiprandGenerateUniformDouble(hiprand_generator, ddblb, size_b) != HIPRAND_STATUS_SUCCESS) { + std::cout << "\n hiprandGenerateUniformDouble() failed !!!" 
<< "\n"; + return; + } + + if(hiprandGenerateUniformDouble(hiprand_generator, ddblc, size_c) != HIPRAND_STATUS_SUCCESS) { + std::cout << "\n hiprandGenerateUniformDouble() failed !!!" << "\n"; + return; + } + + if(hipStreamSynchronize(hip_stream) != hipSuccess) { + std::cout << "hipStreamSynchronize() failed !!! for stream " << hip_stream << std::endl; + return; + } + } } + else { - if(ops_type == "dgemm") { + size_t i; + uint64_t nextr = (uint64_t) time(NULL); - //DGEMM stuff - for (i = 0; i < size_a; ++i) - hdbla[i] = (double)fast_pseudo_rand(&nextr, i); + //SGEMM (float fp32_r) + if(ops_type == "sgemm") { - for (i = 0; i < size_b; ++i) - hdblb[i] = (double)fast_pseudo_rand(&nextr, i); + for (i = 0; i < size_a; ++i) + ha[i] = fast_pseudo_rand(&nextr, i); - for (int i = 0; i < size_c; ++i) - hdblc[i] = (double)fast_pseudo_rand(&nextr, i); - } + for (i = 0; i < size_b; ++i) + hb[i] = fast_pseudo_rand(&nextr, i); - if(ops_type == "hgemm") { + for (int i = 0; i < size_c; ++i) + hc[i] = fast_pseudo_rand(&nextr, i); + } - //HGEMM stuff - for (i = 0; i < size_a; ++i) - hhlfa[i] = fast_pseudo_rand(&nextr, i); + //DGEMM (double fp64_r) + if(ops_type == "dgemm") { - for (i = 0; i < size_b; ++i) - hhlfb[i] = fast_pseudo_rand(&nextr, i); + for (i = 0; i < size_a; ++i) + hdbla[i] = (double)fast_pseudo_rand(&nextr, i); - for (int i = 0; i < size_c; ++i) - hhlfc[i] = fast_pseudo_rand(&nextr, i); - } + for (i = 0; i < size_b; ++i) + hdblb[i] = (double)fast_pseudo_rand(&nextr, i); - // 8-bit floating point real (fp8_r) format - if(data_type == "fp8_r") { + for (int i = 0; i < size_c; ++i) + hdblc[i] = (double)fast_pseudo_rand(&nextr, i); + } - for (i = 0; i < size_a; ++i) - ((struct rocblas_f8* )hda)[i] = rocblas_f8(fast_pseudo_rand(&nextr, i)); + //HGEMM (half-float fp16_r) + if(ops_type == "hgemm") { - for (i = 0; i < size_b; ++i) - ((struct rocblas_f8* )hdb)[i] = rocblas_f8(fast_pseudo_rand(&nextr, i)); + for (i = 0; i < size_a; ++i) + hhlfa[i] = fast_pseudo_rand(&nextr, i); - for (i = 0; i < size_c; ++i) - ((struct rocblas_f8* )hdc)[i] = rocblas_f8(fast_pseudo_rand(&nextr, i)); - } + for (i = 0; i < size_b; ++i) + hhlfb[i] = fast_pseudo_rand(&nextr, i); - // 16-bit floating point real (fp16_r) format - if(data_type == "fp16_r") { + for (int i = 0; i < size_c; ++i) + hhlfc[i] = fast_pseudo_rand(&nextr, i); + } - for (i = 0; i < size_a; ++i) - ((rocblas_half* )hda)[i] = rocblas_half(fast_pseudo_rand(&nextr, i)); + // 8-bit floating point real (fp8_r) format + if(data_type == "fp8_r") { - for (i = 0; i < size_b; ++i) - ((rocblas_half* )hdb)[i] = rocblas_half(fast_pseudo_rand(&nextr, i)); + for (i = 0; i < size_a; ++i) + ((struct rocblas_f8* )hda)[i] = rocblas_f8(fast_pseudo_rand(&nextr, i)); - for (i = 0; i < size_c; ++i) - ((rocblas_half* )hdc)[i] = rocblas_half(fast_pseudo_rand(&nextr, i)); - } + for (i = 0; i < size_b; ++i) + ((struct rocblas_f8* )hdb)[i] = rocblas_f8(fast_pseudo_rand(&nextr, i)); - // 16-bit brain floating point real (bf16_r) format - if(data_type == "bf16_r") { + for (i = 0; i < size_c; ++i) + ((struct rocblas_f8* )hdc)[i] = rocblas_f8(fast_pseudo_rand(&nextr, i)); + } + + // 16-bit floating point real (fp16_r) format + if(data_type == "fp16_r") { - for (i = 0; i < size_a; ++i) - ((struct rocblas_bfloat16* )hda)[i] = rocblas_bfloat16(fast_pseudo_rand(&nextr, i)); + for (i = 0; i < size_a; ++i) + ((rocblas_half* )hda)[i] = rocblas_half(fast_pseudo_rand(&nextr, i)); - for (i = 0; i < size_b; ++i) - ((struct rocblas_bfloat16* )hdb)[i] = rocblas_bfloat16(fast_pseudo_rand(&nextr, i)); + 
for (i = 0; i < size_b; ++i) + ((rocblas_half* )hdb)[i] = rocblas_half(fast_pseudo_rand(&nextr, i)); - for (i = 0; i < size_c; ++i) - ((struct rocblas_bfloat16* )hdc)[i] = rocblas_bfloat16(fast_pseudo_rand(&nextr, i)); + for (i = 0; i < size_c; ++i) + ((rocblas_half* )hdc)[i] = rocblas_half(fast_pseudo_rand(&nextr, i)); + } + + // 16-bit brain floating point real (bf16_r) format + if(data_type == "bf16_r") { + + for (i = 0; i < size_a; ++i) + ((struct rocblas_bfloat16* )hda)[i] = rocblas_bfloat16(fast_pseudo_rand(&nextr, i)); + + for (i = 0; i < size_b; ++i) + ((struct rocblas_bfloat16* )hdb)[i] = rocblas_bfloat16(fast_pseudo_rand(&nextr, i)); + + for (i = 0; i < size_c; ++i) + ((struct rocblas_bfloat16* )hdc)[i] = rocblas_bfloat16(fast_pseudo_rand(&nextr, i)); + } } } } From de66a41c7f5b20f0167bb7198d5a9fae3b503dc8 Mon Sep 17 00:00:00 2001 From: Mohammed Junaid Date: Tue, 27 Aug 2024 14:06:00 -0500 Subject: [PATCH 2/5] 1. Review comment error handling 2. Init. & free hiprand gen. (if present) --- src/rvs_blas.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/rvs_blas.cpp b/src/rvs_blas.cpp index 437f9f29..2b3dfcb1 100644 --- a/src/rvs_blas.cpp +++ b/src/rvs_blas.cpp @@ -103,6 +103,7 @@ rvs_blas::rvs_blas(int _gpu_device_index, int _m, int _n, int _k, std::string _m , hpo(nullptr), hco(nullptr) , hout(nullptr), hdout(nullptr) , hip_stream(nullptr) + , hiprand_generator(nullptr) , blas_handle(nullptr) , is_handle_init(false) , is_error(false) @@ -532,6 +533,8 @@ void rvs_blas::release_gpu_matrix_mem(void) { if (is_handle_init) { rocblas_destroy_handle(blas_handle); + if(hiprand_generator) + hiprandDestroyGenerator(hiprand_generator); hipStreamDestroy(hip_stream); } } @@ -832,21 +835,25 @@ void rvs_blas::generate_random_matrix_data(void) { if(hiprandGenerateUniformDouble(hiprand_generator, ddbla, size_a) != HIPRAND_STATUS_SUCCESS) { std::cout << "\n hiprandGenerateUniformDouble() failed !!!" << "\n"; + is_error = true; return; } if(hiprandGenerateUniformDouble(hiprand_generator, ddblb, size_b) != HIPRAND_STATUS_SUCCESS) { std::cout << "\n hiprandGenerateUniformDouble() failed !!!" << "\n"; + is_error = true; return; } if(hiprandGenerateUniformDouble(hiprand_generator, ddblc, size_c) != HIPRAND_STATUS_SUCCESS) { std::cout << "\n hiprandGenerateUniformDouble() failed !!!" << "\n"; + is_error = true; return; } if(hipStreamSynchronize(hip_stream) != hipSuccess) { std::cout << "hipStreamSynchronize() failed !!! for stream " << hip_stream << std::endl; + is_error = true; return; } } From 1604f1273e5b4df0645156c90d57686b46c387c2 Mon Sep 17 00:00:00 2001 From: Mohammed Junaid Date: Tue, 27 Aug 2024 14:10:52 -0500 Subject: [PATCH 3/5] 1. Faster matrix data init. 2. 
Lower power target --- rvs/conf/MI300X/gst_stress.conf | 1 + rvs/conf/MI300X/iet_single.conf | 24 ++++++++++++------------ rvs/conf/MI300X/iet_stress.conf | 1 + 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/rvs/conf/MI300X/gst_stress.conf b/rvs/conf/MI300X/gst_stress.conf index 87ade879..d7c03a40 100644 --- a/rvs/conf/MI300X/gst_stress.conf +++ b/rvs/conf/MI300X/gst_stress.conf @@ -59,4 +59,5 @@ actions: ldc: 28000 alpha: 1 beta: 1 + matrix_init: hiprand diff --git a/rvs/conf/MI300X/iet_single.conf b/rvs/conf/MI300X/iet_single.conf index b26599c3..c1fb94f9 100644 --- a/rvs/conf/MI300X/iet_single.conf +++ b/rvs/conf/MI300X/iet_single.conf @@ -23,14 +23,14 @@ # # # ############################################################################### -# Test #1 - iet-620W-1K-rand-dgemm +# Test #1 - iet-400W-1K-rand-dgemm # # Preconditions: # Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g # option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify # Set parallel execution to true # Set matrix_size to 1024 for dgemm operations -# Set target power to 620 Watts +# Set target power to 400 Watts # Set test duration to 1 min # # Run test with: @@ -38,17 +38,17 @@ # ./rvs -c conf/MI300X/iet_single.conf # # Expected result: -# The test on each GPU passes (TRUE) if the GPU power reaches at least 620 Watts, +# The test on each GPU passes (TRUE) if the GPU power reaches at least 400 Watts, # FALSE otherwise actions: -- name: iet-620W-1K-rand-dgemm +- name: iet-400W-1K-rand-dgemm device: all module: iet parallel: true duration: 60000 sample_interval: 3000 - target_power: 620 + target_power: 400 matrix_size: 1024 matrix_init: rand ops_type: dgemm @@ -60,7 +60,7 @@ actions: # option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify # Set parallel execution to true # Set matrix_size to 28000 for dgemm operations -# Set target power to 750 Watts +# Set target power to 750 Watts # Set wait duration to 30 seconds (GPU idle period) # Set test duration to 2 mins # @@ -81,17 +81,17 @@ actions: sample_interval: 3000 target_power: 750 matrix_size: 28000 - matrix_init: rand + matrix_init: hiprand ops_type: dgemm -# Test #3 - iet-wait-620W-1K-rand-dgemm +# Test #3 - iet-wait-400W-1K-rand-dgemm # # Preconditions: # Set device to all. 
If you need to run the rvs only on a subset of GPUs, please run rvs with -g # option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify # Set parallel execution to true # Set matrix_size to 1024 for dgemm operations -# Set target power to 620 Watts +# Set target power to 400 Watts # Set wait duration to 30 seconds (GPU idle period) # Set test duration to 1 min # @@ -100,10 +100,10 @@ actions: # ./rvs -c conf/MI300X/iet_single.conf # # Expected result: -# The test on each GPU passes (TRUE) if the GPU power reaches at least 620 Watts, +# The test on each GPU passes (TRUE) if the GPU power reaches at least 400 Watts, # FALSE otherwise -- name: iet-wait-620W-1K-rand-dgemm +- name: iet-wait-400W-1K-rand-dgemm device: all module: iet parallel: true @@ -111,7 +111,7 @@ actions: duration: 60000 sample_interval: 3000 log_interval: 3000 - target_power: 620 + target_power: 400 matrix_size: 1024 matrix_init: rand ops_type: dgemm diff --git a/rvs/conf/MI300X/iet_stress.conf b/rvs/conf/MI300X/iet_stress.conf index 824533e0..fa2885e3 100644 --- a/rvs/conf/MI300X/iet_stress.conf +++ b/rvs/conf/MI300X/iet_stress.conf @@ -60,4 +60,5 @@ actions: ldc: 28000 alpha: 1 beta: 1 + matrix_init: hiprand From 6e3d83b7fa85a362a2cac78a0954185d2fcea273 Mon Sep 17 00:00:00 2001 From: Mohammed Junaid Date: Tue, 27 Aug 2024 17:39:16 -0500 Subject: [PATCH 4/5] 1. Added matrix init. 2. rvs_blas interfaces function definition changes. --- gst.so/src/gst_worker.cpp | 14 +++++------ iet.so/include/action.h | 7 +++++- iet.so/include/iet_worker.h | 47 ++++++++++++++++++++++--------------- iet.so/src/action.cpp | 11 +++++++++ iet.so/src/iet_worker.cpp | 6 ++--- include/rvs_blas.h | 6 ++--- perf.so/src/perf_worker.cpp | 4 ++-- src/rvs_blas.cpp | 4 ++-- tst.so/src/tst_worker.cpp | 4 ++-- 9 files changed, 64 insertions(+), 39 deletions(-) diff --git a/gst.so/src/gst_worker.cpp b/gst.so/src/gst_worker.cpp index 35ed4194..ee97f8f2 100644 --- a/gst.so/src/gst_worker.cpp +++ b/gst.so/src/gst_worker.cpp @@ -100,7 +100,7 @@ void GSTWorker::setup_blas(int *error, string *err_description) { gpu_blas->generate_random_matrix_data(); if (!copy_matrix) { // copy matrix only once - if (!gpu_blas->copy_data_to_gpu(gst_ops_type)) { + if (!gpu_blas->copy_data_to_gpu()) { *error = 1; *err_description = GST_BLAS_MEMCPY_ERROR; } @@ -137,7 +137,7 @@ void GSTWorker::hit_max_gflops(int *error, string *err_description) { if (copy_matrix) { // copy matrix before each GEMM - if (!gpu_blas->copy_data_to_gpu(gst_ops_type)) { + if (!gpu_blas->copy_data_to_gpu()) { *error = 1; *err_description = GST_BLAS_MEMCPY_ERROR; return; @@ -145,7 +145,7 @@ void GSTWorker::hit_max_gflops(int *error, string *err_description) { } // run GEMM operation - if (!gpu_blas->run_blass_gemm(gst_ops_type)) + if (!gpu_blas->run_blas_gemm()) continue; // failed to run the GEMM operation // Waits for GEMM operation to complete @@ -235,7 +235,7 @@ bool GSTWorker::do_gst_ramp(int *error, string *err_description) { // Generate random matrix data gpu_blas->generate_random_matrix_data(); // copy matrix before each GEMM - if (!gpu_blas->copy_data_to_gpu(gst_ops_type)) { + if (!gpu_blas->copy_data_to_gpu()) { *error = 1; *err_description = GST_BLAS_MEMCPY_ERROR; return false; @@ -246,7 +246,7 @@ bool GSTWorker::do_gst_ramp(int *error, string *err_description) { start_time = gpu_blas->get_time_us(); // run GEMM operation - if(!gpu_blas->run_blass_gemm(gst_ops_type)) + if(!gpu_blas->run_blas_gemm()) continue; // Wait for GEMM operation to complete @@ -436,7 
+436,7 @@ bool GSTWorker::do_gst_stress_test(int *error, std::string *err_description) { if (copy_matrix) { // copy matrix before each GEMM - if (!gpu_blas->copy_data_to_gpu(gst_ops_type)) { + if (!gpu_blas->copy_data_to_gpu()) { *error = 1; *err_description = GST_BLAS_MEMCPY_ERROR; return false; @@ -449,7 +449,7 @@ bool GSTWorker::do_gst_stress_test(int *error, std::string *err_description) { for (uint64_t i = 0; i < gst_hot_calls; i++) { // run GEMM operation - if(!gpu_blas->run_blass_gemm(gst_ops_type)) { + if(!gpu_blas->run_blas_gemm()) { *err_description = GST_BLAS_ERROR; *error = 1; diff --git a/iet.so/include/action.h b/iet.so/include/action.h index f3f81bb0..a0caf6d5 100644 --- a/iet.so/include/action.h +++ b/iet.so/include/action.h @@ -77,7 +77,8 @@ class iet_action: public rvs::actionbase { //! TRUE if JSON output is required bool bjson = false; - std::string iet_ops_type; + //! gemm operation type + std::string iet_ops_type; //! target power level for the test float iet_target_power; //! IET test ramp duration @@ -118,6 +119,10 @@ class iet_action: public rvs::actionbase { int iet_ldc_offset; int iet_ldd_offset; + //! matrix initialization method : + //! default, random integer or trignometric float + std::string iet_matrix_init; + friend class IETWorker; //! list of GPUs (along with some identification data) which are diff --git a/iet.so/include/iet_worker.h b/iet.so/include/iet_worker.h index d88ee656..2bc73c47 100644 --- a/iet.so/include/iet_worker.h +++ b/iet.so/include/iet_worker.h @@ -130,16 +130,16 @@ class IETWorker : public rvs::ThreadBase { //! returns the target power level for the test float get_target_power(void) { return target_power; } - //! sets the SGEMM matrix size + //! sets the matrix size void set_matrix_size(uint64_t _matrix_size) { matrix_size = _matrix_size; } - //! returns the SGEMM matrix size + //! returns the matrix size uint64_t get_matrix_size(void) { return matrix_size; } - //! sets the EDPp power tolerance + //! sets gemm operation type void set_iet_ops_type(std::string ops_type) { iet_ops_type = ops_type; } - //! returns the EDPp power tolerance + //! get gemm operation type std::string get_ops_type(void) { return iet_ops_type; } //! sets the EDPp power tolerance @@ -157,10 +157,11 @@ class IETWorker : public rvs::ThreadBase { //! returns the JSON flag static bool get_use_json(void) { return bjson; } - //! returns the SGEMM matrix size + + //! returns the matrix size a uint64_t get_matrix_size_a(void) { return matrix_size_a; } - //! returns the SGEMM matrix size + //! returns the matrix size b uint64_t get_matrix_size_b(void) { return matrix_size_b; } //! returns the matrix size c @@ -199,15 +200,15 @@ class IETWorker : public rvs::ThreadBase { void set_ldd_offset(int ldd) { iet_ldd_offset = ldd; } - //! sets the SGEMM matrix size + //! sets the matrix size a void set_matrix_size_a(uint64_t _matrix_size_a) { matrix_size_a = _matrix_size_a; } - //! sets the SGEMM matrix size + //! sets the matrix size b void set_matrix_size_b(uint64_t _matrix_size_b) { matrix_size_b = _matrix_size_b; } - //! sets the SGEMM matrix size + //! sets the matrix size c void set_matrix_size_c(uint64_t _matrix_size_c) { matrix_size_c = _matrix_size_c; } @@ -218,6 +219,12 @@ class IETWorker : public rvs::ThreadBase { //! returns bandwidth workload status bool get_bw_workload(void) { return iet_bw_workload; } + //! sets the matrix init + void set_matrix_init(std::string _matrix_init) { matrix_init = _matrix_init; } + + //! 
returns matrix init + std::string get_matrix_init(void) { return matrix_init; } + //! BLAS callback static void blas_callback (bool status, void *user_data); @@ -269,13 +276,13 @@ class IETWorker : public rvs::ThreadBase { //! power tolerance (how much the target_power can fluctuare after //! the ramp period for the test to succeed) float tolerance; - //! SGEMM matrix size + //! matrix size uint64_t matrix_size; //! TRUE if JSON output is required static bool bjson; bool sgemm_success; - //! blas_worker pointer - std::string iet_ops_type; + //! gemm operation type + std::string iet_ops_type; //! actual training time uint64_t training_time_ms; @@ -287,27 +294,29 @@ class IETWorker : public rvs::ThreadBase { float avg_power_training; //! the SGEMM delay which gives the actual GPU SGEMM frequency float sgemm_si_delay; - //! SGEMM matrix size + //! matrix sizes uint64_t matrix_size_a; uint64_t matrix_size_b; uint64_t matrix_size_c; - //leading offsets + //! leading offsets int iet_lda_offset; int iet_ldb_offset; int iet_ldc_offset; int iet_ldd_offset; - //Matrix transpose A + //! Matrix transpose A int iet_trans_a; - //Matrix transpose B + //! Matrix transpose B int iet_trans_b; - //IET aplha value + //! IET aplha value float iet_alpha_val; - //IET beta value + //! IET beta value float iet_beta_val; - //IET TP flag + //! IET TP flag bool iet_tp_flag; //! Bandwidth workload enable/disable bool iet_bw_workload; + //! matrix init + std::string matrix_init; bool endtest = false; //! GEMM operations synchronization mutex diff --git a/iet.so/src/action.cpp b/iet.so/src/action.cpp index 82c9cf0a..c20d8085 100644 --- a/iet.so/src/action.cpp +++ b/iet.so/src/action.cpp @@ -87,6 +87,7 @@ using std::fstream; #define RVS_CONF_TP_FLAG "targetpower_met" #define RVS_TP_MESSAGE "target_power" #define RVS_DTYPE_MESSAGE "dtype" +#define RVS_CONF_MATRIX_INIT "matrix_init" #define MODULE_NAME "iet" #define MODULE_NAME_CAPS "IET" @@ -110,6 +111,7 @@ using std::fstream; #define IET_DEFAULT_LDD_OFFSET 0 #define IET_DEFAULT_TP_FLAG false #define IET_DEFAULT_BW_WORKLOAD false +#define IET_DEFAULT_MATRIX_INIT "default" #define IET_NO_COMPATIBLE_GPUS "No AMD compatible GPU found!" 
#define PCI_ALLOC_ERROR "pci_alloc() error" @@ -316,6 +318,14 @@ bool iet_action::get_all_iet_config_keys(void) { bsts = false; } + error = property_get(RVS_CONF_MATRIX_INIT, &iet_matrix_init, IET_DEFAULT_MATRIX_INIT); + if (error == 1) { + msg = "invalid '" + + std::string(RVS_CONF_MATRIX_INIT) + "' key value"; + rvs::lp::Err(msg, MODULE_NAME_CAPS, action_name); + bsts = false; + } + /* Set minimum sample interval as default */ if (iet_sample_interval < IET_DEFAULT_SAMPLE_INTERVAL) { iet_sample_interval = IET_DEFAULT_SAMPLE_INTERVAL; @@ -507,6 +517,7 @@ bool iet_action::do_edp_test(map iet_gpus_device_index) { workers[i].set_ldd_offset(iet_ldd_offset); workers[i].set_tp_flag(iet_tp_flag); workers[i].set_bw_workload(iet_bw_workload); + workers[i].set_matrix_init(iet_matrix_init); i++; } diff --git a/iet.so/src/iet_worker.cpp b/iet.so/src/iet_worker.cpp index 461f3358..9d4646c3 100644 --- a/iet.so/src/iet_worker.cpp +++ b/iet.so/src/iet_worker.cpp @@ -145,14 +145,14 @@ void IETWorker::blasThread(int gpuIdx, uint64_t matrix_size, std::string iet_o duration = 0; gem_ops = 0; // setup rvsBlas - gpu_blas = std::unique_ptr(new rvs_blas(gpuIdx, matrix_size, matrix_size, matrix_size, "default", transa, transb, alpha, beta, + gpu_blas = std::unique_ptr(new rvs_blas(gpuIdx, matrix_size, matrix_size, matrix_size, matrix_init, transa, transb, alpha, beta, iet_lda_offset, iet_ldb_offset, iet_ldc_offset, iet_ldd_offset, iet_ops_type, "")); //Genreate random matrix data gpu_blas->generate_random_matrix_data(); //Copy data to GPU - gpu_blas->copy_data_to_gpu(iet_ops_type); + gpu_blas->copy_data_to_gpu(); iet_start_time = std::chrono::system_clock::now(); @@ -160,7 +160,7 @@ void IETWorker::blasThread(int gpuIdx, uint64_t matrix_size, std::string iet_o while ((duration < run_duration_ms) && (endtest == false)) { //call the gemm blas - gpu_blas->run_blass_gemm(iet_ops_type); + gpu_blas->run_blas_gemm(); // Waits for GEMM operation to complete if(!gpu_blas->is_gemm_op_complete()) diff --git a/include/rvs_blas.h b/include/rvs_blas.h index 4830d37d..53cddef0 100644 --- a/include/rvs_blas.h +++ b/include/rvs_blas.h @@ -54,7 +54,7 @@ typedef void (*rvsBlasCallback_t) (bool status, void *userData); class rvs_blas { public: rvs_blas(int _gpu_device_index, int _m, int _n, int _k, std::string _matrix_init, - int transa, int transb, float aplha, float beta, + int transa, int transb, float alpha, float beta, rocblas_int lda, rocblas_int ldb, rocblas_int ldc, rocblas_int ldd, std::string _ops_type, std::string _data_type); rvs_blas() = delete; @@ -89,8 +89,8 @@ class rvs_blas { //! 
returns TRUE if an error occured bool error(void) { return is_error; } void generate_random_matrix_data(void); - bool copy_data_to_gpu(std::string); - bool run_blass_gemm(std::string); + bool copy_data_to_gpu(void); + bool run_blas_gemm(void); bool is_gemm_op_complete(void); bool validate_gemm(bool self_check, bool accu_check, double &self_error, double &accu_error); void set_gemm_error(uint64_t _error_freq, uint64_t _error_count); diff --git a/perf.so/src/perf_worker.cpp b/perf.so/src/perf_worker.cpp index 3f44c22e..7244810c 100644 --- a/perf.so/src/perf_worker.cpp +++ b/perf.so/src/perf_worker.cpp @@ -98,7 +98,7 @@ void PERFWorker::setup_blas(int *error, string *err_description) { gpu_blas->generate_random_matrix_data(); if (!copy_matrix) { // copy matrix only once - if (!gpu_blas->copy_data_to_gpu(perf_ops_type)) { + if (!gpu_blas->copy_data_to_gpu()) { *error = 1; *err_description = PERF_BLAS_MEMCPY_ERROR; } @@ -181,7 +181,7 @@ bool PERFWorker::do_perf_stress_test(int *error, std::string *err_description) { while(num_gemm_ops++ <= perf_hot_calls) { // run GEMM & wait for completion - gpu_blas->run_blass_gemm(perf_ops_type); + gpu_blas->run_blas_gemm(); } //End the timer diff --git a/src/rvs_blas.cpp b/src/rvs_blas.cpp index 2b3dfcb1..773fe1cf 100644 --- a/src/rvs_blas.cpp +++ b/src/rvs_blas.cpp @@ -233,7 +233,7 @@ bool rvs_blas::init_gpu_device(void) { * @brief copy data matrix from host to gpu * @return true if everything went fine, otherwise false */ -bool rvs_blas::copy_data_to_gpu(std::string ops_type) { +bool rvs_blas::copy_data_to_gpu(void) { if("hiprand" == matrix_init) { @@ -665,7 +665,7 @@ bool rvs_blas::is_gemm_op_complete(void) { * @brief performs the GEMM matrix multiplication operations * @return true if GPU was able to enqueue the GEMM operation, otherwise false */ -bool rvs_blas::run_blass_gemm(std::string ops_type) { +bool rvs_blas::run_blas_gemm(void) { if (!is_error) { diff --git a/tst.so/src/tst_worker.cpp b/tst.so/src/tst_worker.cpp index b62e6ab5..110dbc22 100644 --- a/tst.so/src/tst_worker.cpp +++ b/tst.so/src/tst_worker.cpp @@ -162,14 +162,14 @@ void TSTWorker::blasThread(int gpuIdx, uint64_t matrix_size, std::string tst_ops gpu_blas->generate_random_matrix_data(); //Copy data to GPU - gpu_blas->copy_data_to_gpu(tst_ops_type); + gpu_blas->copy_data_to_gpu(); tst_start_time = std::chrono::system_clock::now(); //Hit the GPU with load to increase temperature while ( (duration < run_duration_ms) && (endtest == false) ){ //call the gemm blas - gpu_blas->run_blass_gemm(tst_ops_type); + gpu_blas->run_blas_gemm(); /* Set callback to be called upon completion of blas gemm operations */ gpu_blas->set_callback(blas_callback, (void *)this); From 9fa1e363defa09bc89111814ba3c9792ef7fdcd5 Mon Sep 17 00:00:00 2001 From: Mohammed Junaid <88209527+jkottiku@users.noreply.github.com> Date: Thu, 29 Aug 2024 15:41:47 -0700 Subject: [PATCH 5/5] Merge pull request #806 from jkottiku/master Power stress & performance conf. 
--- rvs/conf/MI300X-HF/gst_single.conf | 269 +++++++++++++++++++++++++++++ rvs/conf/MI300X-HF/iet_stress.conf | 64 +++++++ 2 files changed, 333 insertions(+) create mode 100644 rvs/conf/MI300X-HF/gst_single.conf create mode 100644 rvs/conf/MI300X-HF/iet_stress.conf diff --git a/rvs/conf/MI300X-HF/gst_single.conf b/rvs/conf/MI300X-HF/gst_single.conf new file mode 100644 index 00000000..c0e30195 --- /dev/null +++ b/rvs/conf/MI300X-HF/gst_single.conf @@ -0,0 +1,269 @@ +# ################################################################################ +# # +# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. +# # +# # MIT LICENSE: +# # Permission is hereby granted, free of charge, to any person obtaining a copy of +# # this software and associated documentation files (the "Software"), to deal in +# # the Software without restriction, including without limitation the rights to +# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# # of the Software, and to permit persons to whom the Software is furnished to do +# # so, subject to the following conditions: +# # +# # The above copyright notice and this permission notice shall be included in all +# # copies or substantial portions of the Software. +# # +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# # SOFTWARE. +# # +# ############################################################################### + +# GST test - gst-1215Tflops-4K4K8K-rand-fp8 +# +# Preconditions: +# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g +# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify +# all the GPUs IDs separated by white space +# Set matrices sizes to 4864 * 4096 * 8192 +# Set matrix data type as fp8 real number +# Set matrix data initialization method as random integer +# Set copy_matrix to false (the matrices will be copied to GPUs only once) +# Set target stress GFLOPS as 1215000 (1215 TFLOPS) +# +# Expected result: +# The test on each GPU passes (TRUE) if the GPU achieves 1215 TFLOPS or more +# within the test duration of 15 seconds after ramp-up duration of 5 seconds. +# Else test on the GPU fails (FALSE). 
+ +actions: +- name: gst-1215Tflops-4K4K8K-rand-fp8 + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 1215000 + matrix_size_a: 4864 + matrix_size_b: 4096 + matrix_size_c: 8192 + matrix_init: rand + data_type: fp8_r + lda: 8320 + ldb: 8320 + ldc: 4992 + ldd: 4992 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-981Tflops-4K4K8K-trig-fp8 + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 981000 + matrix_size_a: 4864 + matrix_size_b: 4096 + matrix_size_c: 8192 + matrix_init: trig + data_type: fp8_r + lda: 8320 + ldb: 8320 + ldc: 4992 + ldd: 4992 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-639Tflops-4K4K8K-rand-fp16 + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 639000 + matrix_size_a: 4864 + matrix_size_b: 4096 + matrix_size_c: 8192 + matrix_init: rand + data_type: fp16_r + lda: 8320 + ldb: 8320 + ldc: 4992 + ldd: 4992 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-523Tflops-4K4K8K-trig-fp16 + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 523000 + matrix_size_a: 4864 + matrix_size_b: 4096 + matrix_size_c: 8192 + matrix_init: trig + data_type: fp16_r + lda: 8320 + ldb: 8320 + ldc: 4992 + ldd: 4992 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-581Tflops-4K4K8K-rand-bf16 + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 581000 + matrix_size_a: 4864 + matrix_size_b: 4096 + matrix_size_c: 8192 + matrix_init: rand + data_type: bf16_r + lda: 8320 + ldb: 8320 + ldc: 4992 + ldd: 4992 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-552Tflops-4K4K8K-trig-bf16 + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 552000 + matrix_size_a: 4864 + matrix_size_b: 4096 + matrix_size_c: 8192 + matrix_init: trig + data_type: bf16_r + lda: 8320 + ldb: 8320 + ldc: 4992 + ldd: 4992 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-100Tflops-3K-trig-sgemm + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 100000 + matrix_size_a: 3072 + matrix_size_b: 3072 + matrix_size_c: 3072 + matrix_init: trig + ops_type: sgemm + lda: 3072 + ldb: 3072 + ldc: 3072 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-100Tflops-3K-rand-sgemm + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 100000 + matrix_size_a: 3072 + matrix_size_b: 3072 + matrix_size_c: 3072 + matrix_init: rand + ops_type: sgemm + lda: 3072 + ldb: 3072 + ldc: 3072 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-70Tflops-8K-trig-dgemm + device: all + module: gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 70000 + matrix_size_a: 8192 + matrix_size_b: 8192 + matrix_size_c: 8192 + matrix_init: trig + ops_type: dgemm + lda: 8192 + ldb: 8192 + ldc: 8192 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + +- name: gst-70Tflops-8K-rand-dgemm + device: all + module: 
gst + log_interval: 3000 + ramp_interval: 5000 + duration: 15000 + hot_calls: 1000 + copy_matrix: false + target_stress: 70000 + matrix_size_a: 8192 + matrix_size_b: 8192 + matrix_size_c: 8192 + matrix_init: rand + ops_type: dgemm + lda: 8192 + ldb: 8192 + ldc: 8192 + transa: 1 + transb: 0 + alpha: 1 + beta: 0 + diff --git a/rvs/conf/MI300X-HF/iet_stress.conf b/rvs/conf/MI300X-HF/iet_stress.conf new file mode 100644 index 00000000..952c3c0d --- /dev/null +++ b/rvs/conf/MI300X-HF/iet_stress.conf @@ -0,0 +1,64 @@ +# ################################################################################ +# # +# # Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. +# # +# # MIT LICENSE: +# # Permission is hereby granted, free of charge, to any person obtaining a copy of +# # this software and associated documentation files (the "Software"), to deal in +# # the Software without restriction, including without limitation the rights to +# # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# # of the Software, and to permit persons to whom the Software is furnished to do +# # so, subject to the following conditions: +# # +# # The above copyright notice and this permission notice shall be included in all +# # copies or substantial portions of the Software. +# # +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# # SOFTWARE. +# # +# ############################################################################### + +# IET stress test +# +# Preconditions: +# Set device to all. If you need to run the rvs only on a subset of GPUs, please run rvs with -g +# option, collect the GPUs IDs (e.g.: GPU[ 5 - 50599] -> 50599 is the GPU ID) and then specify +# all the GPUs IDs separated by comma. +# Set parallel execution to true (gemm workload execution on all GPUs in parallel) +# Set gemm operation type as dgemm. +# Set matrix_size to 28000. +# Test duration set to 10 mins. +# Target power set to 850W for each GPU. +# +# Run test with: +# cd bin +# ./rvs -c conf/MI300X-HF/iet_stress.conf +# +# Expected result: +# The test on each GPU passes (TRUE) if the GPU achieves power target of 850W. +# + +actions: +- name: iet-stress-850W-dgemm-true + device: all + module: iet + parallel: true + duration: 600000 + ramp_interval: 10000 + sample_interval: 5000 + log_interval: 5000 + target_power: 850 + matrix_size: 28000 + ops_type: dgemm + lda: 28000 + ldb: 28000 + ldc: 28000 + alpha: 1 + beta: 1 + matrix_init: hiprand +
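
Illustration of the hipRAND flow this series wires in: when matrix_init is set to hiprand, rvs_blas creates a generator bound to its HIP stream in init_gpu_device(), fills the device matrices in place in generate_random_matrix_data(), and skips both host matrix allocation and the host-to-device copy (hence the early returns added to copy_data_to_gpu() and alocate_host_matrix_mem()). The sketch below is a minimal standalone illustration of that sequence, not code from the patch; the header path (hiprand/hiprand.h versus hiprand.h on older ROCm), the check_hip/check_rand helpers, the buffer size, and the build line "hipcc hiprand_demo.cpp -lhiprand" are assumptions made for the example.

#include <hip/hip_runtime.h>
#include <hiprand/hiprand.h>   // may be <hiprand.h> on older ROCm installs
#include <cstdio>
#include <cstdlib>

// Small local helpers for error checking (not part of the patch).
static void check_hip(hipError_t e, const char *what) {
  if (e != hipSuccess) { std::fprintf(stderr, "%s failed: %d\n", what, (int)e); std::exit(1); }
}
static void check_rand(hiprandStatus_t s, const char *what) {
  if (s != HIPRAND_STATUS_SUCCESS) { std::fprintf(stderr, "%s failed: %d\n", what, (int)s); std::exit(1); }
}

int main() {
  const size_t n = 1024 * 1024;   // element count is illustrative; RVS sizes come from the .conf files
  double *d_a = nullptr;
  hipStream_t stream;

  check_hip(hipStreamCreate(&stream), "hipStreamCreate");
  check_hip(hipMalloc((void**)&d_a, n * sizeof(double)), "hipMalloc");

  // Same calls the patch adds in rvs_blas::init_gpu_device():
  // create the generator once and bind it to the GEMM stream.
  hiprandGenerator_t gen;
  check_rand(hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT), "hiprandCreateGenerator");
  check_rand(hiprandSetStream(gen, stream), "hiprandSetStream");

  // As in generate_random_matrix_data() for dgemm: fill the device buffer
  // directly with uniform doubles, then wait on the stream. No host staging
  // buffer and no hipMemcpy is needed.
  check_rand(hiprandGenerateUniformDouble(gen, d_a, n), "hiprandGenerateUniformDouble");
  check_hip(hipStreamSynchronize(stream), "hipStreamSynchronize");

  // Teardown mirrors release_gpu_matrix_mem(): destroy the generator if it was created.
  check_rand(hiprandDestroyGenerator(gen), "hiprandDestroyGenerator");
  check_hip(hipFree(d_a), "hipFree");
  check_hip(hipStreamDestroy(stream), "hipStreamDestroy");
  return 0;
}

For single-precision inputs the analogous hipRAND call would be hiprandGenerateUniform on a float buffer, but this series only wires the hiprand path for dgemm, which matches the IET conf entries above that pair matrix_init: hiprand with ops_type: dgemm.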