Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RPP Audio Support HIP - To Decibels #398

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
73 commits
Select commit Hold shift + click to select a range
c33af22
Bump rocm-docs-core[api_reference] from 0.35.0 to 0.35.1 in /docs/sph…
dependabot[bot] Mar 6, 2024
14f6334
Bump rocm-docs-core[api_reference] from 0.35.1 to 0.36.0 in /docs/sph…
dependabot[bot] Mar 12, 2024
95c3272
Merge branch 'master' into develop
kiritigowda Mar 12, 2024
3973c34
added api support for ToDecibels HIP kernel
sampath1117 Mar 19, 2024
3f08f90
added test suite support for audio in HIP
sampath1117 Mar 8, 2024
b9c0788
added profiler support for hip test suite
sampath1117 Mar 18, 2024
ab9de97
added initial max find support for 1D and 2D data
sampath1117 Mar 19, 2024
ffd7817
added initial support for todecibels kernel
sampath1117 Mar 20, 2024
641f653
Docs - Bump rocm-docs-core[api_reference] from 0.36.0 to 0.37.0 in /d…
dependabot[bot] Mar 20, 2024
5568573
Link cleanup (#326)
LisaDelaney Mar 20, 2024
a6749ba
Update notes
LisaDelaney Mar 20, 2024
8a98556
improved precision match to 6 decimals
sampath1117 Mar 21, 2024
089a347
added kernal launch configuration for max kernel
sampath1117 Mar 21, 2024
a255906
Docs - Bump rocm-docs-core[api_reference] from 0.37.0 to 0.37.1 in /d…
dependabot[bot] Mar 22, 2024
d3df761
RPP Voxel Flip on HIP and HOST (#285)
r-abishek Mar 23, 2024
ebecb42
RPP Vignette Tensor on HOST and HIP (#311)
r-abishek Mar 23, 2024
5ff01c9
remove empty blank line
sampath1117 Mar 25, 2024
fc1410b
Bump rocm-docs-core[api_reference] from 0.37.1 to 0.38.0 in /docs/sph…
dependabot[bot] Mar 27, 2024
c0f0d34
Merge branch 'develop' into sr/to_decibels_hip
sampath1117 Mar 29, 2024
5741b66
changed 8 pixel load and store to single pixel load and store for 2d …
sampath1117 Mar 29, 2024
3ebd7c3
RPP Tensor Audio Support - Resample (#310)
r-abishek Apr 3, 2024
76f31df
Docs - Missing input and output images for Doxygen (#331)
r-abishek Apr 3, 2024
b83f910
Scratch buffers rename for HOST and HIP (#324)
r-abishek Apr 3, 2024
ebeb131
Update CMakeLists.txt
kiritigowda Apr 3, 2024
2c160cd
Merge branch 'develop' into sr/to_decibels_hip
sampath1117 Apr 8, 2024
14538d2
added missing hipDeviceSynchronize() in test suite
sampath1117 Apr 8, 2024
574d42d
minor bug fix
sampath1117 Apr 8, 2024
fe37ae1
removed f16 includes since not needed for audio
sampath1117 Apr 4, 2024
179253d
restructured python test suite
sampath1117 Apr 4, 2024
6a99788
added empty line at EOF
sampath1117 Apr 8, 2024
68b70a5
fixed spacing in Doxygen
sampath1117 Apr 11, 2024
1147bfe
Update CMakeLists.txt
kiritigowda Apr 12, 2024
ccd493d
Merge remote-tracking branch 'develop' into sr/to_decibels_hip
sampath1117 Apr 16, 2024
5e3fc7a
Bump rocm-docs-core[api_reference] from 0.38.1 to 1.0.0 in /docs/sphi…
dependabot[bot] Apr 18, 2024
b6b7cc5
Bump rocm-docs-core[api_reference] from 1.0.0 to 1.1.0 in /docs/sphin…
dependabot[bot] Apr 25, 2024
e16ad7a
RPP Gaussian Noise Voxel Tensor on HOST and HIP (#323)
r-abishek Apr 26, 2024
7d34794
Merge branch 'develop' into sr/to_decibels_hip
sampath1117 Apr 30, 2024
d2f5d8e
modify CHECK to CHECK_RETURN_STATUS
sampath1117 Apr 30, 2024
b30f50b
Merge branch 'develop' into sr/to_decibels_hip
sampath1117 May 2, 2024
77e14ef
Minor common-fixes for HIP (#345)
r-abishek May 7, 2024
34f3f6d
Readme Updates: --usecase=rocm (#349)
kiritigowda May 8, 2024
ab52683
RPP Tensor Audio Support - Spectrogram (#312)
r-abishek May 8, 2024
ee0d6fe
Update CHANGELOG.md (#352)
r-abishek May 8, 2024
2decd32
RPP Tensor Audio Support - Slice (#325)
r-abishek May 8, 2024
30ce1d6
RPP Tensor Audio Support - MelFilterBank (#332)
r-abishek May 8, 2024
64ae74f
RPP Tensor Normalize ND on HOST and HIP (#335)
r-abishek May 9, 2024
1a3015c
SWDEV-459739 - Remove the package obsolete setting (#353)
raramakr May 9, 2024
bcc9ac9
Merge branch 'develop' into sr/to_decibels_hip
sampath1117 May 9, 2024
4cb8d4b
Audio support merge commit fixes (#354)
r-abishek May 9, 2024
7349eab
remove min_smem from comments
sampath1117 May 16, 2024
7cd32bd
modified verify_output to have different cutoff for HIP and HOST back…
sampath1117 May 15, 2024
f78e93c
separated reduction kernels for 1d and 2d
sampath1117 May 16, 2024
a086a88
reorganized kernels
sampath1117 May 16, 2024
a601ddc
Merge branch 'develop' into sr/to_decibels_hip
sampath1117 May 16, 2024
b29bfe6
addded empty line at EOF
sampath1117 May 17, 2024
fb53f63
add more comments for 1D reduction kernel
sampath1117 May 17, 2024
63235b4
removed handle floatArr[0] usage in hip kernel
sampath1117 May 17, 2024
8efaa3e
added more comments
sampath1117 May 17, 2024
8732386
renamed tensor_hip_audio to tensor_audio_hip
sampath1117 May 17, 2024
33e5f15
make device helper as inline
sampath1117 May 19, 2024
0cc5c20
Merge branch 'develop' into sr/to_decibels_hip
sampath1117 May 30, 2024
42c1c9f
Merge branch 'develop' into sr/to_decibels_hip
r-abishek Jul 10, 2024
305e138
Merge pull request #252 from sampath1117/sr/to_decibels_hip
r-abishek Jul 10, 2024
274ca8c
Merge branch 'develop' into ar/audio_support_2_to_decibels_hip
r-abishek Jul 16, 2024
6e29926
audio test suite changes for python 2 compatibility
sampath1117 Jul 17, 2024
a69f90e
added validation checks for numDims
sampath1117 Jul 17, 2024
953fdcb
Merge pull request #294 from sampath1117/sr/to_decibels_hip_pr_changes
r-abishek Jul 17, 2024
f650fee
Merge branch 'develop' into ar/audio_support_2_to_decibels_hip
kiritigowda Jul 22, 2024
7d6b240
Merge branch 'develop' into ar/audio_support_2_to_decibels_hip
kiritigowda Jul 23, 2024
eebb7b1
Merge branch 'develop' into sr/to_decibels_merge_develop
sampath1117 Jul 25, 2024
8c31674
removed duplicate code added in merge
sampath1117 Jul 25, 2024
0361342
Merge pull request #297 from sampath1117/sr/to_decibels_merge_develop
r-abishek Jul 25, 2024
ee8d034
Merge branch 'develop' into ar/audio_support_2_to_decibels_hip
kiritigowda Aug 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/rppdefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ typedef enum
RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE = -21,
/*! \brief Scratch memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */
RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE = -22,
/*! \brief Number of src dims is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
RPP_ERROR_INVALID_SRC_DIMS = -23
} RppStatus;

/*! \brief RPP rppStatus_t type enums
Expand Down
25 changes: 22 additions & 3 deletions include/rppt_tensor_audio_augmentations.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,11 @@ RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, RpptDescPtr srcD
#endif // GPU_SUPPORT

/*! \brief To Decibels augmentation on HOST backend
* \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values
* \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
* \param [in] srcDims source tensor sizes for each element in batch (2D tensor in HOST memory, of size batchSize * 2)
* \param [in] cutOffDB minimum or cut-off ratio in dB
* \param [in] multiplier factor by which the logarithm is multiplied
Expand All @@ -103,6 +103,25 @@ RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, RpptDescPtr srcD
*/
RppStatus rppt_to_decibels_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief To Decibels augmentation on HIP backend
* \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
* \param [in] srcDims source tensor sizes for each element in batch (2D tensor in Pinned/HIP memory, of size batchSize * 2)
* \param [in] cutOffDB minimum or cut-off ratio in dB
* \param [in] multiplier factor by which the logarithm is multiplied
* \param [in] referenceMagnitude Reference magnitude if not provided maximum value of input used as reference
* \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Pre Emphasis Filter augmentation on HOST backend
* \details Pre Emphasis Filter augmentation for audio data
* \param [in] srcPtr source tensor in HOST memory
Expand Down
1 change: 1 addition & 0 deletions src/modules/hip/hip_tensor_audio_augmentations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@ SOFTWARE.
#define HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP

#include "kernel/non_silent_region_detection.hpp"
#include "kernel/to_decibels.hpp"

#endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP
312 changes: 312 additions & 0 deletions src/modules/hip/kernel/to_decibels.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,312 @@
#include <hip/hip_runtime.h>
#include "rpp_hip_common.hpp"

// -------------------- Set 0 - to_decibels device helpers --------------------

__device__ __forceinline__ void to_decibels_hip_compute(d_float8 *src_f8, d_float8 *dst_f8, double minRatio, float multiplier, float inverseMagnitude)
{
dst_f8->f1[0] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[0]) * inverseMagnitude)));
dst_f8->f1[1] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[1]) * inverseMagnitude)));
dst_f8->f1[2] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[2]) * inverseMagnitude)));
dst_f8->f1[3] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[3]) * inverseMagnitude)));
dst_f8->f1[4] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[4]) * inverseMagnitude)));
dst_f8->f1[5] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[5]) * inverseMagnitude)));
dst_f8->f1[6] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[6]) * inverseMagnitude)));
dst_f8->f1[7] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[7]) * inverseMagnitude)));
}

// -------------------- Set 1 - kernels for finding inverse magnitude value --------------------

__global__ void inverse_magnitude_hip_tensor(float *srcPtr,
int maxLength,
bool computeMax,
float *inverseMagnitudeTensor)

{
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

// Do final reduction on block wise max
if (computeMax)
{
uint srcIdx = id_z * maxLength;
__shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads

if (id_x >= maxLength)
return;

srcIdx += id_x;
float maxVal = srcPtr[srcIdx];
while (id_x < maxLength)
{
maxVal = fmaxf(maxVal, srcPtr[srcIdx]);
id_x += hipBlockDim_x;
srcIdx += hipBlockDim_x;
}
max_smem[hipThreadIdx_x] = maxVal;
__syncthreads(); // syncthreads after max compute

// Reduction of 256 floats on 256 threads per block in x dimension
for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
{
if (hipThreadIdx_x < threadMax)
max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
__syncthreads();
}

// Final store to dst
if (hipThreadIdx_x == 0)
inverseMagnitudeTensor[id_z] = 1.f / max_smem[0];
}
else
{
inverseMagnitudeTensor[id_z] = 1.0f;
}
}

__global__ void max_reduction_1d_hip_tensor(float *srcPtr,
uint2 srcStridesNH,
RpptImagePatchPtr srcDims,
float *maxArr)
{
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;

uint srcLength = srcDims[id_z].height;
uint srcIdx = id_z * srcStridesNH.x;
__shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads

if (id_x >= srcLength)
return;

srcIdx += id_x;
d_float8 src_f8;
rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]);
__syncthreads(); // syncthreads after max compute

// Reduction of 256 floats on 256 threads per block in x dimension
for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
{
if (hipThreadIdx_x < threadMax)
max_smem[hipThreadIdx_x] = fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
__syncthreads();
}

// Final store to dst
if (hipThreadIdx_x == 0)
maxArr[id_z * hipGridDim_x + hipBlockIdx_x] = max_smem[0];
}

__global__ void max_reduction_2d_hip_tensor(float *srcPtr,
uint2 srcStridesNH,
RpptImagePatchPtr srcDims,
float *maxArr)
{
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

__shared__ float partialMax_smem[16][16]; // 16 rows of src, 16 reduced cols of src in a 16 x 16 thread block
uint srcIdx = (id_z * srcStridesNH.x);
float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads

if ((id_y >= srcDims[id_z].height) || (id_x >= srcDims[id_z].width))
return;

srcIdx += ((id_y * srcStridesNH.y) + id_x);
partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx];
__syncthreads(); // syncthreads

// Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
{
if (hipThreadIdx_x < threadMax)
partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
__syncthreads();
}

if (hipThreadIdx_x == 0)
{
// Reduction of 16 floats on 16 threads per block in y dimension
for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
{
if (hipThreadIdx_y < threadMax)
partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]);
__syncthreads();
}

// Final store to dst
if (hipThreadIdx_y == 0)
maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0];
}
}

// -------------------- Set 2 - to decibels kernels --------------------

__global__ void to_decibels_1d_hip_tensor(float *srcPtr,
uint srcStride,
float *dstPtr,
uint dstStride,
RpptImagePatchPtr srcDims,
double minRatio,
float multiplier,
float *inverseMagnitudeTensor)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if (id_x >= srcDims[id_z].height)
return;

uint srcIdx = (id_z * srcStride) + id_x;
float inverseMagnitude = inverseMagnitudeTensor[id_z];

d_float8 src_f8, dst_f8;
rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
to_decibels_hip_compute(&src_f8, &dst_f8, minRatio, multiplier, inverseMagnitude);

uint dstIdx = (id_z * dstStride) + id_x;
rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
}

__global__ void to_decibels_2d_hip_tensor(float *srcPtr,
uint2 srcStridesNH,
float *dstPtr,
uint2 dstStridesNH,
RpptImagePatchPtr srcDims,
double minRatio,
float multiplier,
float *inverseMagnitudeTensor)
{
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if (id_x >= srcDims[id_z].width || id_y >= srcDims[id_z].height)
return;

uint srcIdx = (id_z * srcStridesNH.x) + (id_y * srcStridesNH.y) + id_x;
uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x;
float inverseMagnitude = inverseMagnitudeTensor[id_z];
dstPtr[dstIdx] = multiplier * log2(max(minRatio, (static_cast<double>(srcPtr[srcIdx]) * inverseMagnitude)));
}

// -------------------- Set 3 - to decibels kernels executor --------------------

RppStatus hip_exec_to_decibels_tensor(Rpp32f *srcPtr,
RpptDescPtr srcDescPtr,
Rpp32f *dstPtr,
RpptDescPtr dstDescPtr,
RpptImagePatchPtr srcDims,
Rpp32f cutOffDB,
Rpp32f multiplier,
Rpp32f referenceMagnitude,
rpp::Handle& handle)
{
Rpp32u numDims = srcDescPtr->numDims - 1; // exclude batchSize from input dims

// Calculate the intermediate values needed for DB conversion
Rpp32f minRatio = std::pow(10, cutOffDB / multiplier);
if(!minRatio)
minRatio = std::nextafter(0.0f, 1.0f);
const Rpp32f log10Factor = 0.3010299956639812; //1 / std::log(10);
multiplier *= log10Factor;

// calculate max in input if referenceMagnitude = 0
Rpp32f *partialMaxArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
Rpp32s numBlocksPerSample = 0;
Rpp32s globalThreads_z = dstDescPtr->n;

// find the invReferenceMagnitude value
bool computeMax = (!referenceMagnitude);
if(computeMax)
{
if (numDims == 1)
{
numBlocksPerSample = ceil(static_cast<Rpp32f>((srcDescPtr->strides.nStride + 7) >> 3) / LOCAL_THREADS_X_1DIM);
hipLaunchKernelGGL(max_reduction_1d_hip_tensor,
dim3(numBlocksPerSample, 1, globalThreads_z),
dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
0,
handle.GetStream(),
srcPtr,
make_uint2(srcDescPtr->strides.nStride, 1),
srcDims,
partialMaxArr);
}
else if (numDims == 2)
{
Rpp32s gridDim_x = ceil(static_cast<Rpp32f>((srcDescPtr->strides.hStride)/LOCAL_THREADS_X));
Rpp32s gridDim_y = ceil(static_cast<Rpp32f>(srcDescPtr->h)/LOCAL_THREADS_Y);
Rpp32s gridDim_z = ceil(static_cast<Rpp32f>(globalThreads_z)/LOCAL_THREADS_Z);
numBlocksPerSample = gridDim_x * gridDim_y * gridDim_z;
hipLaunchKernelGGL(max_reduction_2d_hip_tensor,
dim3(gridDim_x, gridDim_y, gridDim_z),
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
srcDims,
partialMaxArr);
}
hipStreamSynchronize(handle.GetStream());
}
Rpp32u blockSize = (computeMax) ? 256: 1;
Rpp32f *inverseMagnitudeTensor = partialMaxArr + globalThreads_z * numBlocksPerSample;
hipLaunchKernelGGL(inverse_magnitude_hip_tensor,
dim3(1, 1, globalThreads_z),
dim3(blockSize, 1, 1),
0,
handle.GetStream(),
partialMaxArr,
numBlocksPerSample,
computeMax,
inverseMagnitudeTensor);
hipStreamSynchronize(handle.GetStream());

// launch kernel for todecibels
if (numDims == 1)
{
Rpp32s globalThreads_x = (srcDescPtr->strides.nStride + 7) >> 3;
Rpp32s globalThreads_y = 1;
hipLaunchKernelGGL(to_decibels_1d_hip_tensor,
dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
0,
handle.GetStream(),
srcPtr,
srcDescPtr->strides.nStride,
dstPtr,
dstDescPtr->strides.nStride,
srcDims,
static_cast<double>(minRatio),
multiplier,
inverseMagnitudeTensor);
}
else if (numDims == 2)
{
Rpp32s globalThreads_x = srcDescPtr->strides.hStride;
Rpp32s globalThreads_y = srcDescPtr->h;
hipLaunchKernelGGL(to_decibels_2d_hip_tensor,
dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z)),
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
0,
handle.GetStream(),
srcPtr,
make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
dstPtr,
make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
srcDims,
static_cast<double>(minRatio),
multiplier,
inverseMagnitudeTensor);
}

return RPP_SUCCESS;
}
Loading