From 9c9c6e6a7b9be081cacf2fe308ed0759684c3d1e Mon Sep 17 00:00:00 2001 From: Abishek <52214183+r-abishek@users.noreply.github.com> Date: Tue, 7 May 2024 11:56:45 -0700 Subject: [PATCH] Minor common-fixes for HIP (#345) * Use scratchBufferHip * minor fix * remove additional variable use * Add CHECK_RETURN_STATUS to hip API * handle fix --- .../hip/hip_tensor_arithmetic_operations.hpp | 6 +- src/modules/hip/kernel/copy.hpp | 2 +- src/modules/hip/kernel/gamma_correction.hpp | 9 +- src/modules/hip/kernel/gaussian_filter.hpp | 86 +++++++++---------- .../hip/kernel/noise_salt_and_pepper.hpp | 2 +- src/modules/hip/kernel/noise_shot.hpp | 2 +- src/modules/hip/kernel/ricap.hpp | 19 ++-- src/modules/hip/kernel/spatter.hpp | 4 +- src/modules/hip/kernel/warp_affine.hpp | 2 +- src/modules/hip/kernel/water.hpp | 19 ++-- .../rppt_tensor_effects_augmentations.cpp | 11 ++- utilities/test_suite/HIP/Tensor_hip.cpp | 62 ++++++------- utilities/test_suite/HIP/Tensor_voxel_hip.cpp | 36 ++++---- utilities/test_suite/rpp_test_suite_common.h | 12 +-- 14 files changed, 126 insertions(+), 146 deletions(-) diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp index 0345171fc..37d2220b2 100644 --- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp +++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp @@ -22,8 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP -#define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP +#ifndef HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP +#define HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP #include "kernel/fused_multiply_add_scalar.hpp" #include "kernel/add_scalar.hpp" @@ -31,4 +31,4 @@ SOFTWARE. #include "kernel/multiply_scalar.hpp" #include "kernel/magnitude.hpp" -#endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP +#endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/copy.hpp b/src/modules/hip/kernel/copy.hpp index fdcc7bf6f..45ff91d38 100644 --- a/src/modules/hip/kernel/copy.hpp +++ b/src/modules/hip/kernel/copy.hpp @@ -58,7 +58,7 @@ RppStatus hip_exec_copy_tensor(T *srcPtr, { if (srcDescPtr->layout == dstDescPtr->layout) { - hipMemcpy(dstPtr, srcPtr, dstDescPtr->n * dstDescPtr->strides.nStride * sizeof(T), hipMemcpyDeviceToDevice); + CHECK_RETURN_STATUS(hipMemcpy(dstPtr, srcPtr, dstDescPtr->n * dstDescPtr->strides.nStride * sizeof(T), hipMemcpyDeviceToDevice)); } else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) { diff --git a/src/modules/hip/kernel/gamma_correction.hpp b/src/modules/hip/kernel/gamma_correction.hpp index f98790de3..04acdd1b3 100644 --- a/src/modules/hip/kernel/gamma_correction.hpp +++ b/src/modules/hip/kernel/gamma_correction.hpp @@ -227,9 +227,7 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr, int globalThreads_y = handle.GetBatchSize(); int globalThreads_z = 1; - float *gammaLUT; - hipMalloc(&gammaLUT, 256 * handle.GetBatchSize() * sizeof(Rpp32f)); - + Rpp32f *gammaLUT = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; hipLaunchKernelGGL(gamma_correction_lut_compute, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), @@ -238,9 +236,6 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr, gammaLUT, handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem); - - - globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; globalThreads_y = dstDescPtr->h; globalThreads_z = handle.GetBatchSize(); @@ -307,7 +302,5 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr, } } - hipFree(&gammaLUT); - return RPP_SUCCESS; } diff --git a/src/modules/hip/kernel/gaussian_filter.hpp b/src/modules/hip/kernel/gaussian_filter.hpp index b2ae7a878..8e4336be1 100644 --- a/src/modules/hip/kernel/gaussian_filter.hpp +++ b/src/modules/hip/kernel/gaussian_filter.hpp @@ -1912,9 +1912,6 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor, Rpp32f *stdDevTensor, rpp::Handle &handle) { - int localThreads_x = 256; - int localThreads_y = 1; - int localThreads_z = 1; int globalThreads_x = handle.GetBatchSize(); int globalThreads_y = 1; int globalThreads_z = 1; @@ -1922,8 +1919,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor, if (kernelSize == 3) { hipLaunchKernelGGL(create_gaussian_kernel_3x3, - dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), 0, handle.GetStream(), filterTensor, @@ -1933,8 +1930,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor, else if (kernelSize == 5) { hipLaunchKernelGGL(create_gaussian_kernel_5x5, - dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), 0, handle.GetStream(), filterTensor, @@ -1944,8 +1941,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor, else if (kernelSize == 7) { hipLaunchKernelGGL(create_gaussian_kernel_7x7, - dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), 0, handle.GetStream(), filterTensor, @@ -1955,8 +1952,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor, else if (kernelSize == 9) { hipLaunchKernelGGL(create_gaussian_kernel_9x9, - dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), 0, handle.GetStream(), filterTensor, @@ -1982,9 +1979,6 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, if (roiType == RpptRoiType::LTRB) hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); - int localThreads_x = LOCAL_THREADS_X; - int localThreads_y = LOCAL_THREADS_Y; - int localThreads_z = LOCAL_THREADS_Z; int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; int globalThreads_y = dstDescPtr->h; int globalThreads_z = handle.GetBatchSize(); @@ -2010,8 +2004,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, if (kernelSize == 3) { hipLaunchKernelGGL(gaussian_filter_3x3_pkd_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2026,8 +2020,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 5) { hipLaunchKernelGGL(gaussian_filter_5x5_pkd_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2042,8 +2036,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 7) { hipLaunchKernelGGL(gaussian_filter_7x7_pkd_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2058,8 +2052,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 9) { hipLaunchKernelGGL(gaussian_filter_9x9_pkd_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2077,8 +2071,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, if (kernelSize == 3) { hipLaunchKernelGGL(gaussian_filter_3x3_pln_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2094,8 +2088,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 5) { hipLaunchKernelGGL(gaussian_filter_5x5_pln_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2111,8 +2105,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 7) { hipLaunchKernelGGL(gaussian_filter_7x7_pln_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2128,8 +2122,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 9) { hipLaunchKernelGGL(gaussian_filter_9x9_pln_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2150,8 +2144,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, if (kernelSize == 3) { hipLaunchKernelGGL(gaussian_filter_3x3_pkd3_pln3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2166,8 +2160,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 5) { hipLaunchKernelGGL(gaussian_filter_5x5_pkd3_pln3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2182,8 +2176,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 7) { hipLaunchKernelGGL(gaussian_filter_7x7_pkd3_pln3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2198,8 +2192,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 9) { hipLaunchKernelGGL(gaussian_filter_9x9_pkd3_pln3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2219,8 +2213,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, if (kernelSize == 3) { hipLaunchKernelGGL(gaussian_filter_3x3_pln3_pkd3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2235,8 +2229,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 5) { hipLaunchKernelGGL(gaussian_filter_5x5_pln3_pkd3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2251,8 +2245,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 7) { hipLaunchKernelGGL(gaussian_filter_7x7_pln3_pkd3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -2267,8 +2261,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr, else if (kernelSize == 9) { hipLaunchKernelGGL(gaussian_filter_9x9_pln3_pkd3_tensor, - dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, diff --git a/src/modules/hip/kernel/noise_salt_and_pepper.hpp b/src/modules/hip/kernel/noise_salt_and_pepper.hpp index bb377417a..6564ed9c9 100644 --- a/src/modules/hip/kernel/noise_salt_and_pepper.hpp +++ b/src/modules/hip/kernel/noise_salt_and_pepper.hpp @@ -279,7 +279,7 @@ RppStatus hip_exec_salt_and_pepper_noise_tensor(T *srcPtr, Rpp32u *xorwowSeedStream; xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1]; - hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice); + CHECK_RETURN_STATUS(hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice)); if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) { diff --git a/src/modules/hip/kernel/noise_shot.hpp b/src/modules/hip/kernel/noise_shot.hpp index 77e72d192..a28b0d19f 100644 --- a/src/modules/hip/kernel/noise_shot.hpp +++ b/src/modules/hip/kernel/noise_shot.hpp @@ -320,7 +320,7 @@ RppStatus hip_exec_shot_noise_tensor(T *srcPtr, Rpp32u *xorwowSeedStream; xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1]; - hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice); + CHECK_RETURN_STATUS(hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice)); if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) { diff --git a/src/modules/hip/kernel/ricap.hpp b/src/modules/hip/kernel/ricap.hpp index 795538451..4f72d57d2 100644 --- a/src/modules/hip/kernel/ricap.hpp +++ b/src/modules/hip/kernel/ricap.hpp @@ -174,9 +174,6 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr, if (roiType == RpptRoiType::LTRB) hip_exec_roi_converison_ltrb_to_xywh(roiPtrInputCropRegion, handle); - int localThreads_x = LOCAL_THREADS_X; - int localThreads_y = LOCAL_THREADS_Y; - int localThreads_z = LOCAL_THREADS_Z; int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; int globalThreads_y = dstDescPtr->h; int globalThreads_z = handle.GetBatchSize(); @@ -184,8 +181,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr, if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) { hipLaunchKernelGGL(ricap_pkd_tensor, - dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -199,8 +196,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr, else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { hipLaunchKernelGGL(ricap_pln_tensor, - dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -217,8 +214,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr, if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) { hipLaunchKernelGGL(ricap_pkd3_pln3_tensor, - dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -233,8 +230,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr, { globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; hipLaunchKernelGGL(ricap_pln3_pkd3_tensor, - dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, diff --git a/src/modules/hip/kernel/spatter.hpp b/src/modules/hip/kernel/spatter.hpp index 3df78b4c3..ab0831c07 100644 --- a/src/modules/hip/kernel/spatter.hpp +++ b/src/modules/hip/kernel/spatter.hpp @@ -243,8 +243,8 @@ RppStatus hip_exec_spatter_tensor(T *srcPtr, float *spatterMaskPtr, *spatterMaskInvPtr; spatterMaskPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; spatterMaskInvPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem + maskSize; - hipMemcpy(spatterMaskPtr, spatterMask, maskSizeFloat, hipMemcpyHostToDevice); - hipMemcpy(spatterMaskInvPtr, spatterMaskInv, maskSizeFloat, hipMemcpyHostToDevice); + CHECK_RETURN_STATUS(hipMemcpy(spatterMaskPtr, spatterMask, maskSizeFloat, hipMemcpyHostToDevice)); + CHECK_RETURN_STATUS(hipMemcpy(spatterMaskInvPtr, spatterMaskInv, maskSizeFloat, hipMemcpyHostToDevice)); if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) { diff --git a/src/modules/hip/kernel/warp_affine.hpp b/src/modules/hip/kernel/warp_affine.hpp index 8de7a6697..451501e36 100644 --- a/src/modules/hip/kernel/warp_affine.hpp +++ b/src/modules/hip/kernel/warp_affine.hpp @@ -330,7 +330,7 @@ RppStatus hip_exec_warp_affine_tensor(T *srcPtr, int globalThreads_z = handle.GetBatchSize(); float *affineTensorPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; - hipMemcpy(affineTensorPtr, affineTensor, 6 * handle.GetBatchSize() * sizeof(float), hipMemcpyHostToDevice); + CHECK_RETURN_STATUS(hipMemcpy(affineTensorPtr, affineTensor, 6 * handle.GetBatchSize() * sizeof(float), hipMemcpyHostToDevice)); if (interpolationType == RpptInterpolationType::BILINEAR) { diff --git a/src/modules/hip/kernel/water.hpp b/src/modules/hip/kernel/water.hpp index e496b1670..e808a5ed2 100644 --- a/src/modules/hip/kernel/water.hpp +++ b/src/modules/hip/kernel/water.hpp @@ -222,9 +222,6 @@ RppStatus hip_exec_water_tensor(T *srcPtr, if (roiType == RpptRoiType::XYWH) hip_exec_roi_converison_xywh_to_ltrb(roiTensorPtrSrc, handle); - int localThreads_x = 16; - int localThreads_y = 16; - int localThreads_z = 1; int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; int globalThreads_y = dstDescPtr->h; int globalThreads_z = handle.GetBatchSize(); @@ -232,8 +229,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr, if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) { hipLaunchKernelGGL(water_pkd_tensor, - dim3(ceil(static_cast(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil(static_cast(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -251,8 +248,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr, else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { hipLaunchKernelGGL(water_pln_tensor, - dim3(ceil(static_cast(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil(static_cast(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -273,8 +270,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr, if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) { hipLaunchKernelGGL(water_pkd3_pln3_tensor, - dim3(ceil(static_cast(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil(static_cast(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, @@ -293,8 +290,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr, { globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; hipLaunchKernelGGL(water_pln3_pkd3_tensor, - dim3(ceil(static_cast(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), - dim3(localThreads_x, localThreads_y, localThreads_z), + dim3(ceil(static_cast(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), 0, handle.GetStream(), srcPtr, diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp index f089b7cde..24cc8e032 100644 --- a/src/modules/rppt_tensor_effects_augmentations.cpp +++ b/src/modules/rppt_tensor_effects_augmentations.cpp @@ -895,7 +895,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr, { #ifdef HIP_COMPILE RpptROI roiTensorPtrSrcHost[dstDescPtr->n]; - hipMemcpy(roiTensorPtrSrcHost, roiTensorPtrSrc, dstDescPtr->n * sizeof(RpptROI), hipMemcpyDeviceToHost); + CHECK_RETURN_STATUS(hipMemcpy(roiTensorPtrSrcHost, roiTensorPtrSrc, dstDescPtr->n * sizeof(RpptROI), hipMemcpyDeviceToHost)); if (roiType == RpptRoiType::XYWH) { for(int i = 0; i < dstDescPtr->n; i++) @@ -1011,7 +1011,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr, RpptXorwowState *d_xorwowInitialStatePtr; d_xorwowInitialStatePtr = (RpptXorwowState *) rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; - hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowState), hipMemcpyHostToDevice); + CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowState), hipMemcpyHostToDevice)); if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) { @@ -1096,7 +1096,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr, RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr; d_xorwowInitialStatePtr = (RpptXorwowStateBoxMuller *) rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; - hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice); + CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice)); if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) { @@ -1451,9 +1451,8 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, #ifdef HIP_COMPILE if(srcDescPtr->n == 1) // BatchSize should always be greater than 1 return RPP_ERROR; - Rpp32u* permutationHipTensor; - hipMalloc(&permutationHipTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n); - hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice); + Rpp32u *permutationHipTensor = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice)); if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1) || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1) diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 1ee8dd3b4..23c3f19fe 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -250,12 +250,12 @@ int main(int argc, char **argv) // Initialize ROI tensors for src/dst RpptROI *roiTensorPtrSrc, *roiTensorPtrDst; - CHECK(hipHostMalloc(&roiTensorPtrSrc, batchSize * sizeof(RpptROI))); - CHECK(hipHostMalloc(&roiTensorPtrDst, batchSize * sizeof(RpptROI))); + CHECK_RETURN_STATUS(hipHostMalloc(&roiTensorPtrSrc, batchSize * sizeof(RpptROI))); + CHECK_RETURN_STATUS(hipHostMalloc(&roiTensorPtrDst, batchSize * sizeof(RpptROI))); // Initialize the ImagePatch for dst RpptImagePatch *dstImgSizes; - CHECK(hipHostMalloc(&dstImgSizes, batchSize * sizeof(RpptImagePatch))); + CHECK_RETURN_STATUS(hipHostMalloc(&dstImgSizes, batchSize * sizeof(RpptImagePatch))); // Set ROI tensors types for src/dst RpptRoiType roiTypeSrc, roiTypeDst; @@ -315,7 +315,7 @@ int main(int argc, char **argv) // Run case-wise RPP API and measure time rppHandle_t handle; hipStream_t stream; - CHECK(hipStreamCreate(&stream)); + CHECK_RETURN_STATUS(hipStreamCreate(&stream)); rppCreateWithStreamAndBatchSize(&handle, stream, batchSize); int noOfIterations = (int)imageNames.size() / batchSize; @@ -333,30 +333,30 @@ int main(int argc, char **argv) bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u); else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); + CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); } // Allocate hip memory for src/dst - CHECK(hipMalloc(&d_input, inputBufferSize)); - CHECK(hipMalloc(&d_output, outputBufferSize)); + CHECK_RETURN_STATUS(hipMalloc(&d_input, inputBufferSize)); + CHECK_RETURN_STATUS(hipMalloc(&d_output, outputBufferSize)); if(dualInputCase) - CHECK(hipMalloc(&d_input_second, inputBufferSize)); + CHECK_RETURN_STATUS(hipMalloc(&d_input_second, inputBufferSize)); RpptROI *roiPtrInputCropRegion; if(testCase == 82) - CHECK(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI))); + CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI))); RpptROI *cropRoi, *patchRoi; if(testCase == 33) { - CHECK(hipHostMalloc(&cropRoi, batchSize * sizeof(RpptROI))); - CHECK(hipHostMalloc(&patchRoi, batchSize * sizeof(RpptROI))); + CHECK_RETURN_STATUS(hipHostMalloc(&cropRoi, batchSize * sizeof(RpptROI))); + CHECK_RETURN_STATUS(hipHostMalloc(&patchRoi, batchSize * sizeof(RpptROI))); } bool invalidROI = (roiList[0] == 0 && roiList[1] == 0 && roiList[2] == 0 && roiList[3] == 0); Rpp32f *intensity; if(testCase == 46) - CHECK(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f))); // case-wise RPP API and measure time script for Unit and Performance test printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); @@ -396,10 +396,10 @@ int main(int argc, char **argv) convert_input_bitdepth(input, input_second, inputu8, inputu8Second, inputBitDepth, ioBufferSize, inputBufferSize, srcDescPtr, dualInputCase, conversionFactor); //copy decoded inputs to hip buffers - CHECK(hipMemcpy(d_input, input, inputBufferSize, hipMemcpyHostToDevice)); - CHECK(hipMemcpy(d_output, output, outputBufferSize, hipMemcpyHostToDevice)); + CHECK_RETURN_STATUS(hipMemcpy(d_input, input, inputBufferSize, hipMemcpyHostToDevice)); + CHECK_RETURN_STATUS(hipMemcpy(d_output, output, outputBufferSize, hipMemcpyHostToDevice)); if(dualInputCase) - CHECK(hipMemcpy(d_input_second, input_second, inputBufferSize, hipMemcpyHostToDevice)); + CHECK_RETURN_STATUS(hipMemcpy(d_input_second, input_second, inputBufferSize, hipMemcpyHostToDevice)); int roiHeightList[batchSize], roiWidthList[batchSize]; if(invalidROI) @@ -677,8 +677,8 @@ int main(int argc, char **argv) testCaseName = "lut"; Rpp32f *lutBuffer; - CHECK(hipHostMalloc(&lutBuffer, 65536 * sizeof(Rpp32f))); - CHECK(hipMemset(lutBuffer, 0, 65536 * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&lutBuffer, 65536 * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipMemset(lutBuffer, 0, 65536 * sizeof(Rpp32f))); Rpp8u *lut8u = reinterpret_cast(lutBuffer); Rpp16f *lut16f = reinterpret_cast(lutBuffer); Rpp32f *lut32f = reinterpret_cast(lutBuffer); @@ -710,7 +710,7 @@ int main(int argc, char **argv) break; - CHECK(hipHostFree(lutBuffer)); + CHECK_RETURN_STATUS(hipHostFree(lutBuffer)); } case 36: { @@ -1140,7 +1140,7 @@ int main(int argc, char **argv) break; } - CHECK(hipDeviceSynchronize()); + CHECK_RETURN_STATUS(hipDeviceSynchronize()); endWallTime = omp_get_wtime(); wallTime = endWallTime - startWallTime; if (missingFuncFlag == 1) @@ -1215,7 +1215,7 @@ int main(int argc, char **argv) } else { - CHECK(hipMemcpy(output, d_output, outputBufferSize, hipMemcpyDeviceToHost)); + CHECK_RETURN_STATUS(hipMemcpy(output, d_output, outputBufferSize, hipMemcpyDeviceToHost)); // Reconvert other bit depths to 8u for output display purposes convert_output_bitdepth_to_u8(output, outputu8, inputBitDepth, oBufferSize, outputBufferSize, dstDescPtr, invConversionFactor); @@ -1278,29 +1278,29 @@ int main(int argc, char **argv) } // Free memory - CHECK(hipHostFree(roiTensorPtrSrc)); - CHECK(hipHostFree(roiTensorPtrDst)); - CHECK(hipHostFree(dstImgSizes)); + CHECK_RETURN_STATUS(hipHostFree(roiTensorPtrSrc)); + CHECK_RETURN_STATUS(hipHostFree(roiTensorPtrDst)); + CHECK_RETURN_STATUS(hipHostFree(dstImgSizes)); if(testCase == 46) - CHECK(hipHostFree(intensity)); + CHECK_RETURN_STATUS(hipHostFree(intensity)); if(testCase == 82) - CHECK(hipHostFree(roiPtrInputCropRegion)); + CHECK_RETURN_STATUS(hipHostFree(roiPtrInputCropRegion)); if(testCase == 33) { - CHECK(hipHostFree(cropRoi)); - CHECK(hipHostFree(patchRoi)); + CHECK_RETURN_STATUS(hipHostFree(cropRoi)); + CHECK_RETURN_STATUS(hipHostFree(patchRoi)); } if (reductionTypeCase) - CHECK(hipHostFree(reductionFuncResultArr)); + CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr)); free(input); free(input_second); free(output); free(inputu8); free(inputu8Second); free(outputu8); - CHECK(hipFree(d_input)); + CHECK_RETURN_STATUS(hipFree(d_input)); if(dualInputCase) - CHECK(hipFree(d_input_second)); - CHECK(hipFree(d_output)); + CHECK_RETURN_STATUS(hipFree(d_input_second)); + CHECK_RETURN_STATUS(hipFree(d_output)); return 0; } diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp index 5834e4307..b39f461cc 100644 --- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp @@ -122,7 +122,7 @@ int main(int argc, char * argv[]) // set src/dst xyzwhd ROI tensors void *pinnedMemROI; - CHECK(hipHostMalloc(&pinnedMemROI, noOfFiles * sizeof(RpptROI3D))); + CHECK_RETURN_STATUS(hipHostMalloc(&pinnedMemROI, noOfFiles * sizeof(RpptROI3D))); RpptROI3D *roiGenericSrcPtr = reinterpret_cast(pinnedMemROI); // Set buffer sizes in pixels for src/dst @@ -139,16 +139,16 @@ int main(int argc, char * argv[]) // Allocate hip memory in float for RPP strided buffer void *d_inputF32, *d_outputF32; - CHECK(hipMalloc(&d_inputF32, iBufferSizeInBytes)); - CHECK(hipMalloc(&d_outputF32, oBufferSizeInBytes)); + CHECK_RETURN_STATUS(hipMalloc(&d_inputF32, iBufferSizeInBytes)); + CHECK_RETURN_STATUS(hipMalloc(&d_outputF32, oBufferSizeInBytes)); // set argument tensors void *pinnedMemArgs; - CHECK(hipHostMalloc(&pinnedMemArgs, 2 * noOfFiles * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&pinnedMemArgs, 2 * noOfFiles * sizeof(Rpp32f))); rppHandle_t handle; hipStream_t stream; - CHECK(hipStreamCreate(&stream)); + CHECK_RETURN_STATUS(hipStreamCreate(&stream)); rppCreateWithStreamAndBatchSize(&handle, stream, batchSize); // Run case-wise RPP API and measure time @@ -166,8 +166,8 @@ int main(int argc, char * argv[]) inputU8 = static_cast(calloc(iBufferSizeU8, 1)); outputU8 = static_cast(calloc(iBufferSizeU8, 1)); - CHECK(hipMalloc(&d_inputU8, iBufferSizeU8)); - CHECK(hipMalloc(&d_outputU8, iBufferSizeU8)); + CHECK_RETURN_STATUS(hipMalloc(&d_inputU8, iBufferSizeU8)); + CHECK_RETURN_STATUS(hipMalloc(&d_outputU8, iBufferSizeU8)); } printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", funcName.c_str(), numRuns, batchSize); @@ -220,11 +220,11 @@ int main(int argc, char * argv[]) { for(int i = 0; i < iBufferSizeU8; i++) inputU8[i] = std::min(std::max(static_cast(inputF32[i]), static_cast(0)), static_cast(255)); - CHECK(hipMemcpy(d_inputU8, inputU8, iBufferSizeU8, hipMemcpyHostToDevice)); + CHECK_RETURN_STATUS(hipMemcpy(d_inputU8, inputU8, iBufferSizeU8, hipMemcpyHostToDevice)); } //Copy input buffer to hip - CHECK(hipMemcpy(d_inputF32, inputF32, iBufferSizeInBytes, hipMemcpyHostToDevice)); + CHECK_RETURN_STATUS(hipMemcpy(d_inputF32, inputF32, iBufferSizeInBytes, hipMemcpyHostToDevice)); for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++) { @@ -367,7 +367,7 @@ int main(int argc, char * argv[]) } } - CHECK(hipDeviceSynchronize()); + CHECK_RETURN_STATUS(hipDeviceSynchronize()); endWallTime = omp_get_wtime(); wallTime = endWallTime - startWallTime; maxWallTime = std::max(maxWallTime, wallTime); @@ -383,7 +383,7 @@ int main(int argc, char * argv[]) } // Copy output buffer to host - CHECK(hipMemcpy(outputF32, d_outputF32, oBufferSizeInBytes, hipMemcpyDeviceToHost)); + CHECK_RETURN_STATUS(hipMemcpy(outputF32, d_outputF32, oBufferSizeInBytes, hipMemcpyDeviceToHost)); if(testType == 0) { cout << "\n\nGPU Backend Wall Time: " << wallTime <<" ms per batch"<< endl; @@ -406,7 +406,7 @@ int main(int argc, char * argv[]) if(inputBitDepth == 0) { Rpp64u bufferLength = iBufferSize * sizeof(Rpp8u) + descriptorPtr3D->offsetInBytes; - CHECK(hipMemcpy(outputU8, d_outputU8, bufferLength, hipMemcpyDeviceToHost)); + CHECK_RETURN_STATUS(hipMemcpy(outputU8, d_outputU8, bufferLength, hipMemcpyDeviceToHost)); // Copy U8 buffer to F32 buffer for display purposes for(int i = 0; i < bufferLength; i++) @@ -498,10 +498,10 @@ int main(int argc, char * argv[]) free(niftiDataArray); free(inputF32); free(outputF32); - CHECK(hipHostFree(pinnedMemROI)); - CHECK(hipHostFree(pinnedMemArgs)); - CHECK(hipFree(d_inputF32)); - CHECK(hipFree(d_outputF32)); + CHECK_RETURN_STATUS(hipHostFree(pinnedMemROI)); + CHECK_RETURN_STATUS(hipHostFree(pinnedMemArgs)); + CHECK_RETURN_STATUS(hipFree(d_inputF32)); + CHECK_RETURN_STATUS(hipFree(d_outputF32)); if(inputBitDepth == 0) { if(inputU8 != NULL) @@ -509,9 +509,9 @@ int main(int argc, char * argv[]) if(outputU8 != NULL) free(outputU8); if(d_inputU8 != NULL) - CHECK(hipFree(d_inputU8)); + CHECK_RETURN_STATUS(hipFree(d_inputU8)); if(d_outputU8 != NULL) - CHECK(hipFree(d_outputU8)); + CHECK_RETURN_STATUS(hipFree(d_outputU8)); } return(0); diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h index 8360a846c..3ec123d7f 100644 --- a/utilities/test_suite/rpp_test_suite_common.h +++ b/utilities/test_suite/rpp_test_suite_common.h @@ -59,12 +59,12 @@ using namespace std; #define GOLDEN_OUTPUT_MAX_HEIGHT 150 // Golden outputs are generated with MAX_HEIGHT set to 150. Changing this constant will result in QA test failures #define GOLDEN_OUTPUT_MAX_WIDTH 150 // Golden outputs are generated with MAX_WIDTH set to 150. Changing this constant will result in QA test failures -#define CHECK(x) do { \ - int retval = (x); \ - if (retval != 0) { \ - fprintf(stderr, "Runtime error: %s returned %d at %s:%d", #x, retval, __FILE__, __LINE__); \ - exit(-1); \ - } \ +#define CHECK_RETURN_STATUS(x) do { \ + int retval = (x); \ + if (retval != 0) { \ + fprintf(stderr, "Runtime error: %s returned %d at %s:%d", #x, retval, __FILE__, __LINE__); \ + exit(-1); \ + } \ } while (0) std::map augmentationMap =