From 9c9c6e6a7b9be081cacf2fe308ed0759684c3d1e Mon Sep 17 00:00:00 2001
From: Abishek <52214183+r-abishek@users.noreply.github.com>
Date: Tue, 7 May 2024 11:56:45 -0700
Subject: [PATCH] Minor common-fixes for HIP (#345)

* Use scratchBufferHip

* minor fix

* remove additional variable use

* Add CHECK_RETURN_STATUS to hip API

* handle fix
---
 .../hip/hip_tensor_arithmetic_operations.hpp  |  6 +-
 src/modules/hip/kernel/copy.hpp               |  2 +-
 src/modules/hip/kernel/gamma_correction.hpp   |  9 +-
 src/modules/hip/kernel/gaussian_filter.hpp    | 86 +++++++++----------
 .../hip/kernel/noise_salt_and_pepper.hpp      |  2 +-
 src/modules/hip/kernel/noise_shot.hpp         |  2 +-
 src/modules/hip/kernel/ricap.hpp              | 19 ++--
 src/modules/hip/kernel/spatter.hpp            |  4 +-
 src/modules/hip/kernel/warp_affine.hpp        |  2 +-
 src/modules/hip/kernel/water.hpp              | 19 ++--
 .../rppt_tensor_effects_augmentations.cpp     | 11 ++-
 utilities/test_suite/HIP/Tensor_hip.cpp       | 62 ++++++-------
 utilities/test_suite/HIP/Tensor_voxel_hip.cpp | 36 ++++----
 utilities/test_suite/rpp_test_suite_common.h  | 12 +--
 14 files changed, 126 insertions(+), 146 deletions(-)

diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
index 0345171fc..37d2220b2 100644
--- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp
+++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
@@ -22,8 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
-#ifndef HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
-#define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
+#ifndef HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP
+#define HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP
 
 #include "kernel/fused_multiply_add_scalar.hpp"
 #include "kernel/add_scalar.hpp"
@@ -31,4 +31,4 @@ SOFTWARE.
 #include "kernel/multiply_scalar.hpp"
 #include "kernel/magnitude.hpp"
 
-#endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
+#endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/copy.hpp b/src/modules/hip/kernel/copy.hpp
index fdcc7bf6f..45ff91d38 100644
--- a/src/modules/hip/kernel/copy.hpp
+++ b/src/modules/hip/kernel/copy.hpp
@@ -58,7 +58,7 @@ RppStatus hip_exec_copy_tensor(T *srcPtr,
 {
     if (srcDescPtr->layout == dstDescPtr->layout)
     {
-        hipMemcpy(dstPtr, srcPtr, dstDescPtr->n * dstDescPtr->strides.nStride * sizeof(T), hipMemcpyDeviceToDevice);
+        CHECK_RETURN_STATUS(hipMemcpy(dstPtr, srcPtr, dstDescPtr->n * dstDescPtr->strides.nStride * sizeof(T), hipMemcpyDeviceToDevice));
     }
     else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
     {
diff --git a/src/modules/hip/kernel/gamma_correction.hpp b/src/modules/hip/kernel/gamma_correction.hpp
index f98790de3..04acdd1b3 100644
--- a/src/modules/hip/kernel/gamma_correction.hpp
+++ b/src/modules/hip/kernel/gamma_correction.hpp
@@ -227,9 +227,7 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr,
     int globalThreads_y = handle.GetBatchSize();
     int globalThreads_z = 1;
 
-    float *gammaLUT;
-    hipMalloc(&gammaLUT, 256 * handle.GetBatchSize() * sizeof(Rpp32f));
-
+    Rpp32f *gammaLUT = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
     hipLaunchKernelGGL(gamma_correction_lut_compute,
                        dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
                        dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
@@ -238,9 +236,6 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr,
                        gammaLUT,
                        handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem);
 
-    
-    
-    
     globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
     globalThreads_y = dstDescPtr->h;
     globalThreads_z = handle.GetBatchSize();
@@ -307,7 +302,5 @@ RppStatus hip_exec_gamma_correction_tensor(T *srcPtr,
         }
     }
 
-    hipFree(&gammaLUT);
-
     return RPP_SUCCESS;
 }
diff --git a/src/modules/hip/kernel/gaussian_filter.hpp b/src/modules/hip/kernel/gaussian_filter.hpp
index b2ae7a878..8e4336be1 100644
--- a/src/modules/hip/kernel/gaussian_filter.hpp
+++ b/src/modules/hip/kernel/gaussian_filter.hpp
@@ -1912,9 +1912,6 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor,
                                                  Rpp32f *stdDevTensor,
                                                  rpp::Handle &handle)
 {
-    int localThreads_x = 256;
-    int localThreads_y = 1;
-    int localThreads_z = 1;
     int globalThreads_x = handle.GetBatchSize();
     int globalThreads_y = 1;
     int globalThreads_z = 1;
@@ -1922,8 +1919,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor,
     if (kernelSize == 3)
     {
         hipLaunchKernelGGL(create_gaussian_kernel_3x3,
-                           dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+                           dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
                            0,
                            handle.GetStream(),
                            filterTensor,
@@ -1933,8 +1930,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor,
     else if (kernelSize == 5)
     {
         hipLaunchKernelGGL(create_gaussian_kernel_5x5,
-                           dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+                           dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
                            0,
                            handle.GetStream(),
                            filterTensor,
@@ -1944,8 +1941,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor,
     else if (kernelSize == 7)
     {
         hipLaunchKernelGGL(create_gaussian_kernel_7x7,
-                           dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+                           dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
                            0,
                            handle.GetStream(),
                            filterTensor,
@@ -1955,8 +1952,8 @@ static RppStatus hip_exec_create_gaussian_kernel(Rpp32f *filterTensor,
     else if (kernelSize == 9)
     {
         hipLaunchKernelGGL(create_gaussian_kernel_9x9,
-                           dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+                           dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
                            0,
                            handle.GetStream(),
                            filterTensor,
@@ -1982,9 +1979,6 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
     if (roiType == RpptRoiType::LTRB)
         hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
 
-    int localThreads_x = LOCAL_THREADS_X;
-    int localThreads_y = LOCAL_THREADS_Y;
-    int localThreads_z = LOCAL_THREADS_Z;
     int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
     int globalThreads_y = dstDescPtr->h;
     int globalThreads_z = handle.GetBatchSize();
@@ -2010,8 +2004,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         if (kernelSize == 3)
         {
             hipLaunchKernelGGL(gaussian_filter_3x3_pkd_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2026,8 +2020,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         else if (kernelSize == 5)
         {
             hipLaunchKernelGGL(gaussian_filter_5x5_pkd_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2042,8 +2036,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         else if (kernelSize == 7)
         {
             hipLaunchKernelGGL(gaussian_filter_7x7_pkd_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2058,8 +2052,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         else if (kernelSize == 9)
         {
             hipLaunchKernelGGL(gaussian_filter_9x9_pkd_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2077,8 +2071,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         if (kernelSize == 3)
         {
             hipLaunchKernelGGL(gaussian_filter_3x3_pln_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2094,8 +2088,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         else if (kernelSize == 5)
         {
             hipLaunchKernelGGL(gaussian_filter_5x5_pln_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2111,8 +2105,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         else if (kernelSize == 7)
         {
             hipLaunchKernelGGL(gaussian_filter_7x7_pln_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2128,8 +2122,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
         else if (kernelSize == 9)
         {
             hipLaunchKernelGGL(gaussian_filter_9x9_pln_tensor,
-                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -2150,8 +2144,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             if (kernelSize == 3)
             {
                 hipLaunchKernelGGL(gaussian_filter_3x3_pkd3_pln3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
@@ -2166,8 +2160,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             else if (kernelSize == 5)
             {
                 hipLaunchKernelGGL(gaussian_filter_5x5_pkd3_pln3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
@@ -2182,8 +2176,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             else if (kernelSize == 7)
             {
                 hipLaunchKernelGGL(gaussian_filter_7x7_pkd3_pln3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
@@ -2198,8 +2192,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             else if (kernelSize == 9)
             {
                 hipLaunchKernelGGL(gaussian_filter_9x9_pkd3_pln3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
@@ -2219,8 +2213,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             if (kernelSize == 3)
             {
                 hipLaunchKernelGGL(gaussian_filter_3x3_pln3_pkd3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
@@ -2235,8 +2229,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             else if (kernelSize == 5)
             {
                 hipLaunchKernelGGL(gaussian_filter_5x5_pln3_pkd3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
@@ -2251,8 +2245,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             else if (kernelSize == 7)
             {
                 hipLaunchKernelGGL(gaussian_filter_7x7_pln3_pkd3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
@@ -2267,8 +2261,8 @@ RppStatus hip_exec_gaussian_filter_tensor(T *srcPtr,
             else if (kernelSize == 9)
             {
                 hipLaunchKernelGGL(gaussian_filter_9x9_pln3_pkd3_tensor,
-                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/localThreads_z)),
-                                   dim3(localThreads_x, localThreads_y, localThreads_z),
+                                   dim3(ceil((float)globalThreads_x/tileSize.x), ceil((float)globalThreads_y/tileSize.y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                                   dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                    0,
                                    handle.GetStream(),
                                    srcPtr,
diff --git a/src/modules/hip/kernel/noise_salt_and_pepper.hpp b/src/modules/hip/kernel/noise_salt_and_pepper.hpp
index bb377417a..6564ed9c9 100644
--- a/src/modules/hip/kernel/noise_salt_and_pepper.hpp
+++ b/src/modules/hip/kernel/noise_salt_and_pepper.hpp
@@ -279,7 +279,7 @@ RppStatus hip_exec_salt_and_pepper_noise_tensor(T *srcPtr,
 
     Rpp32u *xorwowSeedStream;
     xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1];
-    hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice);
+    CHECK_RETURN_STATUS(hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice));
 
     if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
     {
diff --git a/src/modules/hip/kernel/noise_shot.hpp b/src/modules/hip/kernel/noise_shot.hpp
index 77e72d192..a28b0d19f 100644
--- a/src/modules/hip/kernel/noise_shot.hpp
+++ b/src/modules/hip/kernel/noise_shot.hpp
@@ -320,7 +320,7 @@ RppStatus hip_exec_shot_noise_tensor(T *srcPtr,
 
     Rpp32u *xorwowSeedStream;
     xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1];
-    hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice);
+    CHECK_RETURN_STATUS(hipMemcpy(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice));
 
     if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
     {
diff --git a/src/modules/hip/kernel/ricap.hpp b/src/modules/hip/kernel/ricap.hpp
index 795538451..4f72d57d2 100644
--- a/src/modules/hip/kernel/ricap.hpp
+++ b/src/modules/hip/kernel/ricap.hpp
@@ -174,9 +174,6 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
     if (roiType == RpptRoiType::LTRB)
         hip_exec_roi_converison_ltrb_to_xywh(roiPtrInputCropRegion, handle);
 
-    int localThreads_x = LOCAL_THREADS_X;
-    int localThreads_y = LOCAL_THREADS_Y;
-    int localThreads_z = LOCAL_THREADS_Z;
     int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
     int globalThreads_y = dstDescPtr->h;
     int globalThreads_z = handle.GetBatchSize();
@@ -184,8 +181,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
     if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
     {
         hipLaunchKernelGGL(ricap_pkd_tensor,
-                           dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+                           dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                            0,
                            handle.GetStream(),
                            srcPtr,
@@ -199,8 +196,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
     else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
     {
         hipLaunchKernelGGL(ricap_pln_tensor,
-                           dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+                           dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                            0,
                            handle.GetStream(),
                            srcPtr,
@@ -217,8 +214,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
         if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
         {
             hipLaunchKernelGGL(ricap_pkd3_pln3_tensor,
-                               dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -233,8 +230,8 @@ RppStatus hip_exec_ricap_tensor(T *srcPtr,
         {
             globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
             hipLaunchKernelGGL(ricap_pln3_pkd3_tensor,
-                               dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y), ceil((float)globalThreads_z / localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
diff --git a/src/modules/hip/kernel/spatter.hpp b/src/modules/hip/kernel/spatter.hpp
index 3df78b4c3..ab0831c07 100644
--- a/src/modules/hip/kernel/spatter.hpp
+++ b/src/modules/hip/kernel/spatter.hpp
@@ -243,8 +243,8 @@ RppStatus hip_exec_spatter_tensor(T *srcPtr,
     float *spatterMaskPtr, *spatterMaskInvPtr;
     spatterMaskPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
     spatterMaskInvPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem + maskSize;
-    hipMemcpy(spatterMaskPtr, spatterMask, maskSizeFloat, hipMemcpyHostToDevice);
-    hipMemcpy(spatterMaskInvPtr, spatterMaskInv, maskSizeFloat, hipMemcpyHostToDevice);
+    CHECK_RETURN_STATUS(hipMemcpy(spatterMaskPtr, spatterMask, maskSizeFloat, hipMemcpyHostToDevice));
+    CHECK_RETURN_STATUS(hipMemcpy(spatterMaskInvPtr, spatterMaskInv, maskSizeFloat, hipMemcpyHostToDevice));
 
     if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
     {
diff --git a/src/modules/hip/kernel/warp_affine.hpp b/src/modules/hip/kernel/warp_affine.hpp
index 8de7a6697..451501e36 100644
--- a/src/modules/hip/kernel/warp_affine.hpp
+++ b/src/modules/hip/kernel/warp_affine.hpp
@@ -330,7 +330,7 @@ RppStatus hip_exec_warp_affine_tensor(T *srcPtr,
     int globalThreads_z = handle.GetBatchSize();
 
     float *affineTensorPtr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
-    hipMemcpy(affineTensorPtr, affineTensor, 6 * handle.GetBatchSize() * sizeof(float), hipMemcpyHostToDevice);
+    CHECK_RETURN_STATUS(hipMemcpy(affineTensorPtr, affineTensor, 6 * handle.GetBatchSize() * sizeof(float), hipMemcpyHostToDevice));
 
     if (interpolationType == RpptInterpolationType::BILINEAR)
     {
diff --git a/src/modules/hip/kernel/water.hpp b/src/modules/hip/kernel/water.hpp
index e496b1670..e808a5ed2 100644
--- a/src/modules/hip/kernel/water.hpp
+++ b/src/modules/hip/kernel/water.hpp
@@ -222,9 +222,6 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
     if (roiType == RpptRoiType::XYWH)
         hip_exec_roi_converison_xywh_to_ltrb(roiTensorPtrSrc, handle);
 
-    int localThreads_x = 16;
-    int localThreads_y = 16;
-    int localThreads_z = 1;
     int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
     int globalThreads_y = dstDescPtr->h;
     int globalThreads_z = handle.GetBatchSize();
@@ -232,8 +229,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
     if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
     {
         hipLaunchKernelGGL(water_pkd_tensor,
-                           dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                           dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                            0,
                            handle.GetStream(),
                            srcPtr,
@@ -251,8 +248,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
     else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
     {
         hipLaunchKernelGGL(water_pln_tensor,
-                           dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                           dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                            0,
                            handle.GetStream(),
                            srcPtr,
@@ -273,8 +270,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
         if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
         {
             hipLaunchKernelGGL(water_pkd3_pln3_tensor,
-                               dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
@@ -293,8 +290,8 @@ RppStatus hip_exec_water_tensor(T *srcPtr,
         {
             globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
             hipLaunchKernelGGL(water_pln3_pkd3_tensor,
-                               dim3(ceil(static_cast<float>(globalThreads_x)/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
-                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               dim3(ceil(static_cast<float>(globalThreads_x)/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+                               dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
                                0,
                                handle.GetStream(),
                                srcPtr,
diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp
index f089b7cde..24cc8e032 100644
--- a/src/modules/rppt_tensor_effects_augmentations.cpp
+++ b/src/modules/rppt_tensor_effects_augmentations.cpp
@@ -895,7 +895,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr,
 {
 #ifdef HIP_COMPILE
     RpptROI roiTensorPtrSrcHost[dstDescPtr->n];
-    hipMemcpy(roiTensorPtrSrcHost, roiTensorPtrSrc, dstDescPtr->n * sizeof(RpptROI), hipMemcpyDeviceToHost);
+    CHECK_RETURN_STATUS(hipMemcpy(roiTensorPtrSrcHost, roiTensorPtrSrc, dstDescPtr->n * sizeof(RpptROI), hipMemcpyDeviceToHost));
     if (roiType == RpptRoiType::XYWH)
     {
         for(int i = 0; i < dstDescPtr->n; i++)
@@ -1011,7 +1011,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr,
 
     RpptXorwowState *d_xorwowInitialStatePtr;
     d_xorwowInitialStatePtr = (RpptXorwowState *) rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
-    hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowState), hipMemcpyHostToDevice);
+    CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowState), hipMemcpyHostToDevice));
 
     if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
     {
@@ -1096,7 +1096,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr,
 
     RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr;
     d_xorwowInitialStatePtr = (RpptXorwowStateBoxMuller *) rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
-    hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice);
+    CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice));
 
     if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
     {
@@ -1451,9 +1451,8 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr,
 #ifdef HIP_COMPILE
     if(srcDescPtr->n == 1) // BatchSize should always be greater than 1
         return RPP_ERROR;
-    Rpp32u* permutationHipTensor;
-    hipMalloc(&permutationHipTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n);
-    hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice);
+    Rpp32u *permutationHipTensor = reinterpret_cast<Rpp32u*>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+    CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice));
 
     if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1)
     || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1)
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index 1ee8dd3b4..23c3f19fe 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -250,12 +250,12 @@ int main(int argc, char **argv)
 
     // Initialize ROI tensors for src/dst
     RpptROI *roiTensorPtrSrc, *roiTensorPtrDst;
-    CHECK(hipHostMalloc(&roiTensorPtrSrc, batchSize * sizeof(RpptROI)));
-    CHECK(hipHostMalloc(&roiTensorPtrDst, batchSize * sizeof(RpptROI)));
+    CHECK_RETURN_STATUS(hipHostMalloc(&roiTensorPtrSrc, batchSize * sizeof(RpptROI)));
+    CHECK_RETURN_STATUS(hipHostMalloc(&roiTensorPtrDst, batchSize * sizeof(RpptROI)));
 
     // Initialize the ImagePatch for dst
     RpptImagePatch *dstImgSizes;
-    CHECK(hipHostMalloc(&dstImgSizes, batchSize * sizeof(RpptImagePatch)));
+    CHECK_RETURN_STATUS(hipHostMalloc(&dstImgSizes, batchSize * sizeof(RpptImagePatch)));
 
     // Set ROI tensors types for src/dst
     RpptRoiType roiTypeSrc, roiTypeDst;
@@ -315,7 +315,7 @@ int main(int argc, char **argv)
     // Run case-wise RPP API and measure time
     rppHandle_t handle;
     hipStream_t stream;
-    CHECK(hipStreamCreate(&stream));
+    CHECK_RETURN_STATUS(hipStreamCreate(&stream));
     rppCreateWithStreamAndBatchSize(&handle, stream, batchSize);
 
     int noOfIterations = (int)imageNames.size() / batchSize;
@@ -333,30 +333,30 @@ int main(int argc, char **argv)
             bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
         else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32))
             bitDepthByteSize = sizeof(Rpp32f);  // using 32f outputs for 16f and 32f
-        CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
+        CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
     }
 
     // Allocate hip memory for src/dst
-    CHECK(hipMalloc(&d_input, inputBufferSize));
-    CHECK(hipMalloc(&d_output, outputBufferSize));
+    CHECK_RETURN_STATUS(hipMalloc(&d_input, inputBufferSize));
+    CHECK_RETURN_STATUS(hipMalloc(&d_output, outputBufferSize));
     if(dualInputCase)
-        CHECK(hipMalloc(&d_input_second, inputBufferSize));
+        CHECK_RETURN_STATUS(hipMalloc(&d_input_second, inputBufferSize));
 
     RpptROI *roiPtrInputCropRegion;
     if(testCase == 82)
-        CHECK(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI)));
+        CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI)));
 
     RpptROI *cropRoi, *patchRoi;
     if(testCase == 33)
     {
-        CHECK(hipHostMalloc(&cropRoi, batchSize * sizeof(RpptROI)));
-        CHECK(hipHostMalloc(&patchRoi, batchSize * sizeof(RpptROI)));
+        CHECK_RETURN_STATUS(hipHostMalloc(&cropRoi, batchSize * sizeof(RpptROI)));
+        CHECK_RETURN_STATUS(hipHostMalloc(&patchRoi, batchSize * sizeof(RpptROI)));
     }
     bool invalidROI = (roiList[0] == 0 && roiList[1] == 0 && roiList[2] == 0 && roiList[3] == 0);
 
     Rpp32f *intensity;
     if(testCase == 46)
-        CHECK(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f)));
+        CHECK_RETURN_STATUS(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f)));
 
     // case-wise RPP API and measure time script for Unit and Performance test
     printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize);
@@ -396,10 +396,10 @@ int main(int argc, char **argv)
         convert_input_bitdepth(input, input_second, inputu8, inputu8Second, inputBitDepth, ioBufferSize, inputBufferSize, srcDescPtr, dualInputCase, conversionFactor);
 
         //copy decoded inputs to hip buffers
-        CHECK(hipMemcpy(d_input, input, inputBufferSize, hipMemcpyHostToDevice));
-        CHECK(hipMemcpy(d_output, output, outputBufferSize, hipMemcpyHostToDevice));
+        CHECK_RETURN_STATUS(hipMemcpy(d_input, input, inputBufferSize, hipMemcpyHostToDevice));
+        CHECK_RETURN_STATUS(hipMemcpy(d_output, output, outputBufferSize, hipMemcpyHostToDevice));
         if(dualInputCase)
-            CHECK(hipMemcpy(d_input_second, input_second, inputBufferSize, hipMemcpyHostToDevice));
+            CHECK_RETURN_STATUS(hipMemcpy(d_input_second, input_second, inputBufferSize, hipMemcpyHostToDevice));
 
         int roiHeightList[batchSize], roiWidthList[batchSize];
         if(invalidROI)
@@ -677,8 +677,8 @@ int main(int argc, char **argv)
                     testCaseName = "lut";
 
                     Rpp32f *lutBuffer;
-                    CHECK(hipHostMalloc(&lutBuffer, 65536 * sizeof(Rpp32f)));
-                    CHECK(hipMemset(lutBuffer, 0, 65536 * sizeof(Rpp32f)));
+                    CHECK_RETURN_STATUS(hipHostMalloc(&lutBuffer, 65536 * sizeof(Rpp32f)));
+                    CHECK_RETURN_STATUS(hipMemset(lutBuffer, 0, 65536 * sizeof(Rpp32f)));
                     Rpp8u *lut8u = reinterpret_cast<Rpp8u *>(lutBuffer);
                     Rpp16f *lut16f = reinterpret_cast<Rpp16f *>(lutBuffer);
                     Rpp32f *lut32f = reinterpret_cast<Rpp32f *>(lutBuffer);
@@ -710,7 +710,7 @@ int main(int argc, char **argv)
 
                     break;
 
-                    CHECK(hipHostFree(lutBuffer));
+                    CHECK_RETURN_STATUS(hipHostFree(lutBuffer));
                 }
                 case 36:
                 {
@@ -1140,7 +1140,7 @@ int main(int argc, char **argv)
                     break;
             }
 
-            CHECK(hipDeviceSynchronize());
+            CHECK_RETURN_STATUS(hipDeviceSynchronize());
             endWallTime = omp_get_wtime();
             wallTime = endWallTime - startWallTime;
             if (missingFuncFlag == 1)
@@ -1215,7 +1215,7 @@ int main(int argc, char **argv)
             }
             else
             {
-                CHECK(hipMemcpy(output, d_output, outputBufferSize, hipMemcpyDeviceToHost));
+                CHECK_RETURN_STATUS(hipMemcpy(output, d_output, outputBufferSize, hipMemcpyDeviceToHost));
 
                 // Reconvert other bit depths to 8u for output display purposes
                 convert_output_bitdepth_to_u8(output, outputu8, inputBitDepth, oBufferSize, outputBufferSize, dstDescPtr, invConversionFactor);
@@ -1278,29 +1278,29 @@ int main(int argc, char **argv)
     }
 
     // Free memory
-    CHECK(hipHostFree(roiTensorPtrSrc));
-    CHECK(hipHostFree(roiTensorPtrDst));
-    CHECK(hipHostFree(dstImgSizes));
+    CHECK_RETURN_STATUS(hipHostFree(roiTensorPtrSrc));
+    CHECK_RETURN_STATUS(hipHostFree(roiTensorPtrDst));
+    CHECK_RETURN_STATUS(hipHostFree(dstImgSizes));
     if(testCase == 46)
-        CHECK(hipHostFree(intensity));
+        CHECK_RETURN_STATUS(hipHostFree(intensity));
     if(testCase == 82)
-        CHECK(hipHostFree(roiPtrInputCropRegion));
+        CHECK_RETURN_STATUS(hipHostFree(roiPtrInputCropRegion));
     if(testCase == 33)
     {
-        CHECK(hipHostFree(cropRoi));
-        CHECK(hipHostFree(patchRoi));
+        CHECK_RETURN_STATUS(hipHostFree(cropRoi));
+        CHECK_RETURN_STATUS(hipHostFree(patchRoi));
     }
     if (reductionTypeCase)
-        CHECK(hipHostFree(reductionFuncResultArr));
+        CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr));
     free(input);
     free(input_second);
     free(output);
     free(inputu8);
     free(inputu8Second);
     free(outputu8);
-    CHECK(hipFree(d_input));
+    CHECK_RETURN_STATUS(hipFree(d_input));
     if(dualInputCase)
-        CHECK(hipFree(d_input_second));
-    CHECK(hipFree(d_output));
+        CHECK_RETURN_STATUS(hipFree(d_input_second));
+    CHECK_RETURN_STATUS(hipFree(d_output));
     return 0;
 }
diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
index 5834e4307..b39f461cc 100644
--- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
@@ -122,7 +122,7 @@ int main(int argc, char * argv[])
 
     // set src/dst xyzwhd ROI tensors
     void *pinnedMemROI;
-    CHECK(hipHostMalloc(&pinnedMemROI, noOfFiles * sizeof(RpptROI3D)));
+    CHECK_RETURN_STATUS(hipHostMalloc(&pinnedMemROI, noOfFiles * sizeof(RpptROI3D)));
     RpptROI3D *roiGenericSrcPtr = reinterpret_cast<RpptROI3D *>(pinnedMemROI);
 
     // Set buffer sizes in pixels for src/dst
@@ -139,16 +139,16 @@ int main(int argc, char * argv[])
 
     // Allocate hip memory in float for RPP strided buffer
     void *d_inputF32, *d_outputF32;
-    CHECK(hipMalloc(&d_inputF32, iBufferSizeInBytes));
-    CHECK(hipMalloc(&d_outputF32, oBufferSizeInBytes));
+    CHECK_RETURN_STATUS(hipMalloc(&d_inputF32, iBufferSizeInBytes));
+    CHECK_RETURN_STATUS(hipMalloc(&d_outputF32, oBufferSizeInBytes));
 
     // set argument tensors
     void *pinnedMemArgs;
-    CHECK(hipHostMalloc(&pinnedMemArgs, 2 * noOfFiles * sizeof(Rpp32f)));
+    CHECK_RETURN_STATUS(hipHostMalloc(&pinnedMemArgs, 2 * noOfFiles * sizeof(Rpp32f)));
 
     rppHandle_t handle;
     hipStream_t stream;
-    CHECK(hipStreamCreate(&stream));
+    CHECK_RETURN_STATUS(hipStreamCreate(&stream));
     rppCreateWithStreamAndBatchSize(&handle, stream, batchSize);
 
     // Run case-wise RPP API and measure time
@@ -166,8 +166,8 @@ int main(int argc, char * argv[])
         inputU8 = static_cast<Rpp8u *>(calloc(iBufferSizeU8, 1));
         outputU8 = static_cast<Rpp8u *>(calloc(iBufferSizeU8, 1));
 
-        CHECK(hipMalloc(&d_inputU8, iBufferSizeU8));
-        CHECK(hipMalloc(&d_outputU8, iBufferSizeU8));
+        CHECK_RETURN_STATUS(hipMalloc(&d_inputU8, iBufferSizeU8));
+        CHECK_RETURN_STATUS(hipMalloc(&d_outputU8, iBufferSizeU8));
     }
 
     printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", funcName.c_str(), numRuns, batchSize);
@@ -220,11 +220,11 @@ int main(int argc, char * argv[])
         {
             for(int i = 0; i < iBufferSizeU8; i++)
                 inputU8[i] = std::min(std::max(static_cast<unsigned char>(inputF32[i]), static_cast<unsigned char>(0)), static_cast<unsigned char>(255));
-            CHECK(hipMemcpy(d_inputU8, inputU8, iBufferSizeU8, hipMemcpyHostToDevice));
+            CHECK_RETURN_STATUS(hipMemcpy(d_inputU8, inputU8, iBufferSizeU8, hipMemcpyHostToDevice));
         }
 
         //Copy input buffer to hip
-        CHECK(hipMemcpy(d_inputF32, inputF32, iBufferSizeInBytes, hipMemcpyHostToDevice));
+        CHECK_RETURN_STATUS(hipMemcpy(d_inputF32, inputF32, iBufferSizeInBytes, hipMemcpyHostToDevice));
 
         for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++)
         {
@@ -367,7 +367,7 @@ int main(int argc, char * argv[])
                 }
             }
 
-            CHECK(hipDeviceSynchronize());
+            CHECK_RETURN_STATUS(hipDeviceSynchronize());
             endWallTime = omp_get_wtime();
             wallTime = endWallTime - startWallTime;
             maxWallTime = std::max(maxWallTime, wallTime);
@@ -383,7 +383,7 @@ int main(int argc, char * argv[])
         }
 
         // Copy output buffer to host
-        CHECK(hipMemcpy(outputF32, d_outputF32, oBufferSizeInBytes, hipMemcpyDeviceToHost));
+        CHECK_RETURN_STATUS(hipMemcpy(outputF32, d_outputF32, oBufferSizeInBytes, hipMemcpyDeviceToHost));
         if(testType == 0)
         {
             cout << "\n\nGPU Backend Wall Time: " << wallTime <<" ms per batch"<< endl;
@@ -406,7 +406,7 @@ int main(int argc, char * argv[])
             if(inputBitDepth == 0)
             {
                 Rpp64u bufferLength = iBufferSize * sizeof(Rpp8u) + descriptorPtr3D->offsetInBytes;
-                CHECK(hipMemcpy(outputU8, d_outputU8, bufferLength, hipMemcpyDeviceToHost));
+                CHECK_RETURN_STATUS(hipMemcpy(outputU8, d_outputU8, bufferLength, hipMemcpyDeviceToHost));
 
                 // Copy U8 buffer to F32 buffer for display purposes
                 for(int i = 0; i < bufferLength; i++)
@@ -498,10 +498,10 @@ int main(int argc, char * argv[])
     free(niftiDataArray);
     free(inputF32);
     free(outputF32);
-    CHECK(hipHostFree(pinnedMemROI));
-    CHECK(hipHostFree(pinnedMemArgs));
-    CHECK(hipFree(d_inputF32));
-    CHECK(hipFree(d_outputF32));
+    CHECK_RETURN_STATUS(hipHostFree(pinnedMemROI));
+    CHECK_RETURN_STATUS(hipHostFree(pinnedMemArgs));
+    CHECK_RETURN_STATUS(hipFree(d_inputF32));
+    CHECK_RETURN_STATUS(hipFree(d_outputF32));
     if(inputBitDepth == 0)
     {
         if(inputU8 != NULL)
@@ -509,9 +509,9 @@ int main(int argc, char * argv[])
         if(outputU8 != NULL)
             free(outputU8);
         if(d_inputU8 != NULL)
-            CHECK(hipFree(d_inputU8));
+            CHECK_RETURN_STATUS(hipFree(d_inputU8));
         if(d_outputU8 != NULL)
-            CHECK(hipFree(d_outputU8));
+            CHECK_RETURN_STATUS(hipFree(d_outputU8));
     }
 
     return(0);
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index 8360a846c..3ec123d7f 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -59,12 +59,12 @@ using namespace std;
 #define GOLDEN_OUTPUT_MAX_HEIGHT 150    // Golden outputs are generated with MAX_HEIGHT set to 150. Changing this constant will result in QA test failures
 #define GOLDEN_OUTPUT_MAX_WIDTH 150     // Golden outputs are generated with MAX_WIDTH set to 150. Changing this constant will result in QA test failures
 
-#define CHECK(x) do { \
-  int retval = (x); \
-  if (retval != 0) { \
-    fprintf(stderr, "Runtime error: %s returned %d at %s:%d", #x, retval, __FILE__, __LINE__); \
-    exit(-1); \
-  } \
+#define CHECK_RETURN_STATUS(x) do { \
+    int retval = (x); \
+    if (retval != 0) { \
+        fprintf(stderr, "Runtime error: %s returned %d at %s:%d", #x, retval, __FILE__, __LINE__); \
+        exit(-1); \
+    } \
 } while (0)
 
 std::map<int, string> augmentationMap =