diff --git a/CHANGELOG.md b/CHANGELOG.md index b8fadf597..f2067d8cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest) +## RPP 1.9.2 (Unreleased) + +### Changes + +RPP Audio Support HIP - Mel Filter Bank + ## RPP 1.9.1 for ROCm 6.3.0 ### Changes diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b0ba4024..322f3cd73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,7 @@ endif() set(CMAKE_CXX_STANDARD 17) # RPP Version -set(VERSION "1.9.1") +set(VERSION "1.9.2") # Set Project Version and Language project(rpp VERSION ${VERSION} LANGUAGES CXX) diff --git a/include/rpp_version.h b/include/rpp_version.h index 79e0b248d..448a0d1ac 100644 --- a/include/rpp_version.h +++ b/include/rpp_version.h @@ -40,7 +40,7 @@ extern "C" { // NOTE: IMPORTANT: Match the version with CMakelists.txt version #define RPP_VERSION_MAJOR 1 #define RPP_VERSION_MINOR 9 -#define RPP_VERSION_PATCH 1 +#define RPP_VERSION_PATCH 2 #ifdef __cplusplus } #endif diff --git a/include/rppdefs.h b/include/rppdefs.h index e8d908ccb..b99717c9c 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -73,7 +73,7 @@ SOFTWARE. const float ONE_OVER_6 = 1.0f / 6; const float ONE_OVER_3 = 1.0f / 3; const float ONE_OVER_255 = 1.0f / 255; -const uint MMS_MAX_SCRATCH_MEMORY = 76800000; // maximum scratch memory size (number of floats) needed for MMS buffer in RNNT training +const uint MMS_MAX_SCRATCH_MEMORY = 115293120; // maximum scratch memory size (number of floats) needed for MMS buffer in RNNT training /******************** RPP typedefs ********************/ @@ -154,7 +154,9 @@ typedef enum /*! \brief Scratch memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */ RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE = -22, /*! \brief Number of src dims is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ - RPP_ERROR_INVALID_SRC_DIMS = -23 + RPP_ERROR_INVALID_SRC_DIMS = -23, + /*! \brief Number of dst dims is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_DST_DIMS = -24 } RppStatus; /*! \brief RPP rppStatus_t type enums @@ -748,6 +750,67 @@ typedef struct RpptResamplingWindow __m128 pCenter, pScale; } RpptResamplingWindow; +/*! \brief Base class for Mel scale conversions. + * \ingroup group_rppdefs + */ +struct BaseMelScale +{ + public: + inline RPP_HOST_DEVICE virtual Rpp32f hz_to_mel(Rpp32f hz) = 0; + inline RPP_HOST_DEVICE virtual Rpp32f mel_to_hz(Rpp32f mel) = 0; + virtual ~BaseMelScale() = default; +}; + +/*! \brief Derived class for HTK Mel scale conversions. + * \ingroup group_rppdefs + */ +struct HtkMelScale : public BaseMelScale +{ + inline RPP_HOST_DEVICE Rpp32f hz_to_mel(Rpp32f hz) { return 1127.0f * std::log(1.0f + (hz / 700.0f)); } + inline RPP_HOST_DEVICE Rpp32f mel_to_hz(Rpp32f mel) { return 700.0f * (std::exp(mel / 1127.0f) - 1.0f); } + public: + ~HtkMelScale() {}; +}; + +/*! \brief Derived class for Slaney Mel scale conversions. + * \ingroup group_rppdefs + */ +struct SlaneyMelScale : public BaseMelScale +{ + const Rpp32f freqLow = 0; + const Rpp32f fsp = 66.666667f; + const Rpp32f minLogHz = 1000.0; + const Rpp32f minLogMel = (minLogHz - freqLow) / fsp; + const Rpp32f stepLog = 0.068751777; // Equivalent to std::log(6.4) / 27.0; + + const Rpp32f invMinLogHz = 0.001f; + const Rpp32f invStepLog = 1.0f / stepLog; + const Rpp32f invFsp = 1.0f / fsp; + + inline RPP_HOST_DEVICE Rpp32f hz_to_mel(Rpp32f hz) + { + Rpp32f mel = 0.0f; + if (hz >= minLogHz) + mel = minLogMel + std::log(hz * invMinLogHz) * invStepLog; + else + mel = (hz - freqLow) * invFsp; + + return mel; + } + + inline RPP_HOST_DEVICE Rpp32f mel_to_hz(Rpp32f mel) + { + Rpp32f hz = 0.0f; + if (mel >= minLogMel) + hz = minLogHz * std::exp(stepLog * (mel - minLogMel)); + else + hz = freqLow + mel * fsp; + return hz; + } + public: + ~SlaneyMelScale() {}; +}; + /******************** HOST memory typedefs ********************/ /*! \brief RPP HOST 32-bit float memory diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index d3e78dbb5..0401fec14 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -229,6 +229,28 @@ RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ */ RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDims, Rpp32f maxFreq, Rpp32f minFreq, RpptMelScaleFormula melFormula, Rpp32s numFilter, Rpp32f sampleRate, bool normalize, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! \brief Mel filter bank augmentation on HIP backend + * \details Mel filter bank augmentation for audio data + * \param[in] srcPtr source tensor in HIP memory + * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT) + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT) + * \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) + * \param[in] maxFreq maximum frequency if not provided maxFreq = sampleRate / 2 + * \param[in] minFreq minimum frequency + * \param[in] melFormula formula used to convert frequencies from hertz to mel and from mel to hertz (SLANEY / HTK) + * \param[in] numFilter number of mel filters + * \param[in] sampleRate sampling rate of the audio + * \param[in] normalize boolean variable that determine whether to normalize weights / not + * \param[in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_mel_filter_bank_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDims, Rpp32f maxFreq, Rpp32f minFreq, RpptMelScaleFormula melFormula, Rpp32s numFilter, Rpp32f sampleRate, bool normalize, rppHandle_t rppHandle); +#endif + /*! \brief Resample augmentation on HOST backend * \details Resample augmentation for audio data * \param [in] srcPtr source tensor in HOST memory diff --git a/src/modules/cpu/kernel/mel_filter_bank.hpp b/src/modules/cpu/kernel/mel_filter_bank.hpp index 9cc6d26d2..a8d430d0d 100644 --- a/src/modules/cpu/kernel/mel_filter_bank.hpp +++ b/src/modules/cpu/kernel/mel_filter_bank.hpp @@ -26,58 +26,6 @@ SOFTWARE. #include "rpp_cpu_simd.hpp" #include "rpp_cpu_common.hpp" -struct BaseMelScale -{ - public: - virtual Rpp32f hz_to_mel(Rpp32f hz) = 0; - virtual Rpp32f mel_to_hz(Rpp32f mel) = 0; - virtual ~BaseMelScale() = default; -}; - -struct HtkMelScale : public BaseMelScale -{ - Rpp32f hz_to_mel(Rpp32f hz) { return 1127.0f * std::log(1.0f + (hz / 700.0f)); } - Rpp32f mel_to_hz(Rpp32f mel) { return 700.0f * (std::exp(mel / 1127.0f) - 1.0f); } - public: - ~HtkMelScale() {}; -}; - -struct SlaneyMelScale : public BaseMelScale -{ - const Rpp32f freqLow = 0; - const Rpp32f fsp = 200.0 / 3.0; - const Rpp32f minLogHz = 1000.0; - const Rpp32f minLogMel = (minLogHz - freqLow) / fsp; - const Rpp32f stepLog = 0.068751777; // Equivalent to std::log(6.4) / 27.0; - - const Rpp32f invMinLogHz = 1.0f / 1000.0; - const Rpp32f invStepLog = 1.0f / stepLog; - const Rpp32f invFsp = 1.0f / fsp; - - Rpp32f hz_to_mel(Rpp32f hz) - { - Rpp32f mel = 0.0f; - if (hz >= minLogHz) - mel = minLogMel + std::log(hz * invMinLogHz) * invStepLog; - else - mel = (hz - freqLow) * invFsp; - - return mel; - } - - Rpp32f mel_to_hz(Rpp32f mel) - { - Rpp32f hz = 0.0f; - if (mel >= minLogMel) - hz = minLogHz * std::exp(stepLog * (mel - minLogMel)); - else - hz = freqLow + mel * fsp; - return hz; - } - public: - ~SlaneyMelScale() {}; -}; - RppStatus mel_filter_bank_host_tensor(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32f *dstPtr, @@ -106,7 +54,7 @@ RppStatus mel_filter_bank_host_tensor(Rpp32f *srcPtr, Rpp32u batchSize = srcDescPtr->n; Rpp32f *scratchMem = handle.GetInitHandle()->mem.mcpu.scratchBufferHost; - Rpp32f maxFreq = sampleRate / 2; + Rpp32f maxFreq = (maxFreqVal == 0) ? sampleRate / 2 : maxFreqVal; Rpp32f minFreq = minFreqVal; // Convert lower, higher frequencies to mel scale and find melStep diff --git a/src/modules/hip/handlehip.cpp b/src/modules/hip/handlehip.cpp index 339fb6b05..96a353b3b 100644 --- a/src/modules/hip/handlehip.cpp +++ b/src/modules/hip/handlehip.cpp @@ -240,11 +240,11 @@ struct HandleImpl CHECK_RETURN_STATUS(hipMalloc(&(this->initHandle->mem.mgpu.rgbArr.rgbmem), sizeof(RpptRGB) * this->nBatchSize)); - /* (600000 + 293 + 128) * 128 - Maximum scratch memory required for Non Silent Region Detection HIP kernel used in RNNT training (uses a batchsize 128) + /* (600000 + 293 + 192) * 192 - Maximum scratch memory required for Non Silent Region Detection HIP kernel used in RNNT training (uses a batchsize 192) - 600000 is the maximum size that will be required for MMS buffer based on Librispeech dataset - 293 is the size required for storing reduction outputs for 600000 size sample - - 128 is the size required for storing cutOffDB values for batch size 128 */ - CHECK_RETURN_STATUS(hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 76853888)); + - 192 is the size required for storing cutOffDB values for batch size 192 */ + CHECK_RETURN_STATUS(hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 115293120)); CHECK_RETURN_STATUS(hipHostMalloc(&(this->initHandle->mem.mgpu.scratchBufferPinned.floatmem), sizeof(Rpp32f) * 8294400)); // 3840 x 2160 } }; diff --git a/src/modules/hip/hip_tensor_audio_augmentations.hpp b/src/modules/hip/hip_tensor_audio_augmentations.hpp index ffbcdafb5..f97212d96 100644 --- a/src/modules/hip/hip_tensor_audio_augmentations.hpp +++ b/src/modules/hip/hip_tensor_audio_augmentations.hpp @@ -27,6 +27,7 @@ SOFTWARE. #include "kernel/non_silent_region_detection.hpp" #include "kernel/down_mixing.hpp" +#include "kernel/mel_filter_bank.hpp" #include "kernel/pre_emphasis_filter.hpp" #include "kernel/to_decibels.hpp" #include "kernel/resample.hpp" diff --git a/src/modules/hip/kernel/mel_filter_bank.hpp b/src/modules/hip/kernel/mel_filter_bank.hpp new file mode 100644 index 000000000..9b2eca0d0 --- /dev/null +++ b/src/modules/hip/kernel/mel_filter_bank.hpp @@ -0,0 +1,208 @@ +#include +#include "rpp_hip_common.hpp" + +/* +This kernel transforms the 2D spectrogram output into a Mel-scaled output based on the number of filters (numFilter) and applies optional normalization. + +Mel Filter Bank Transformation: + +Input: A 2D spectrogram of dimensions (numBins, numTimeFrames), where numBins is the number of FFT frequency bins (typically nfft / 2 + 1), and numTimeFrames represents the temporal frames. +Output: A 2D Mel-scaled output of dimensions (numFilter, numTimeFrames), where numFilter is the number of desired Mel filter banks, each corresponding to a range of FFT frequency bins. + +Key Parameters: +numFilter: Number of Mel filter banks. +normalize flag: Whether to apply normalization to the filter bank. +melFormula: Choice of Mel scale formula (HTK or Slaney). +maxFreq and minFreq: Frequency range for the Mel filter banks. + + +Preprocessing: +Before the kernel is launched, Three arrays are precomputed to store the filter intervals, normalization factors, and weights: + +Compute Intervals: +For each Mel filter, compute the frequency intervals (start and end FFT bins) that the filter spans. This is based on the Mel scale conversion of frequency ranges and the relationship between FFT bin indices and actual frequencies. + interval = ceil(f1 / hzStep), +where hzStep is the frequency of the FFT bins (based on the sample rate and nfft). + +Compute Normalization Factors: +If normalize is enabled, compute normalization factors for each filter. This ensures that each filter captures a normalized energy from its frequency interval. + normFactor = 2 / (f2 - f0), +where f0 and f2 are adjacent frequencies defining the boundaries of the filter. + +Compute Filter Weights: +The weights applied to FFT bins in each interval are precomputed, separated into two phases: weights up and weights down. +Weights up increase linearly from the start of the interval to the center. +Weights down decrease linearly from the center of the interval to the end. + weightsUp = (f1 - fftBinStart * hzStep) / (f1 - f0), + weightsDown = (f1 - fIter) * slope, +Kernel Logic: +The kernel applies the Mel filter bank transformation to the spectrogram data for each time frame and each Mel filter. + +Steps in Kernel: +In the first interval, the weights increase linearly from 0 to 1. Apply these weights up to the corresponding FFT bins and accumulate the results into the destination value dstVal. + dstVal += srcVal * weightUp, + where weightUp = (1.0 - weightDown). + +In the second interval, the weights decrease linearly from 1 to 0. Apply these weights down to the FFT bins and accumulate the results into dstVal. + dstVal += srcVal * weightDown, + +Once both intervals have been processed, store the accumulated value dstVal in the output buffer for the current (Mel filter, time frame). +*/ + +__device__ __forceinline__ void compute_mel(float *srcPtr, int melBin, float *weightsDown, int *intervals, int2 fftStrides, float normFactor, float &dstVal) +{ + dstVal = 0; + //start and end FFT bin indices for the current mel bin + int fftbin = intervals[melBin]; + int fftBinEnd = intervals[melBin + 1]; + + float *srcPtrTemp = srcPtr + fftbin * fftStrides.x + fftStrides.y; + // Process the first interval of FFT bins, applying the weights up + for (; fftbin < fftBinEnd; fftbin++, srcPtrTemp += fftStrides.x) + { + float weightUp = 1.0f - weightsDown[fftbin]; + weightUp *= normFactor; + dstVal += *srcPtrTemp * weightUp; + } + + fftBinEnd = intervals[melBin + 2]; // Update the end FFT bin index for the next interval + srcPtrTemp = srcPtr + fftbin * fftStrides.x + fftStrides.y; + + // Process the second interval of FFT bins, applying the weights down + for (; fftbin < fftBinEnd; fftbin++, srcPtrTemp += fftStrides.x) + { + float weightDown = weightsDown[fftbin]; + weightDown *= normFactor; + dstVal += *srcPtrTemp * weightDown; + } +} + +__global__ void mel_filter_bank_tensor(float *srcPtr, + uint2 srcStridesNH, + float *dstPtr, + uint2 dstStridesNH, + int *srcDimsTensor, + int numFilter, + bool normalize, + float *normFactors, + float *weightsDown, + int *intervals) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if (id_x >= srcDimsTensor[id_z * 2 + 1] || id_y >= numFilter) + return; + + uint dstIdx = id_z * dstStridesNH.x + id_y * dstStridesNH.y + id_x; + uint srcIdx = id_z * srcStridesNH.x; + + float normFactor = (normalize) ? normFactors[id_y] : 1; + compute_mel(srcPtr + srcIdx, id_y, weightsDown, intervals, make_int2(srcStridesNH.y, id_x), normFactor, dstPtr[dstIdx]); +} + +RppStatus hip_exec_mel_filter_bank_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s* srcDimsTensor, + Rpp32f maxFreqVal, + Rpp32f minFreqVal, + RpptMelScaleFormula melFormula, + Rpp32s numFilter, + Rpp32f sampleRate, + bool normalize, + rpp::Handle& handle) +{ + // Create an instance of the MelScale class based on the chosen formula + BaseMelScale *melScalePtr; + switch (melFormula) + { + case RpptMelScaleFormula::HTK: + melScalePtr = new HtkMelScale; + break; + case RpptMelScaleFormula::SLANEY: + default: + melScalePtr = new SlaneyMelScale(); + break; + } + + Rpp32f maxFreq = (maxFreqVal == 0) ? sampleRate / 2 : maxFreqVal; + Rpp32f minFreq = minFreqVal; + + // Convert the frequency range to Mel scale and compute Mel step size + Rpp64f melLow = melScalePtr->hz_to_mel(minFreq); + Rpp64f melHigh = melScalePtr->hz_to_mel(maxFreq); + Rpp64f melStep = (melHigh - melLow) / (numFilter + 1); + + Rpp32f *scratchMem = handle.GetInitHandle()->mem.mgpu.scratchBufferPinned.floatmem; + Rpp32f *normFactors = scratchMem; + Rpp32f *weightsDown = scratchMem + numFilter; + Rpp32s *intervals = reinterpret_cast(weightsDown + srcDescPtr->h); + + // parameters for FFT and frequency bins + Rpp32s nfft = (srcDescPtr->h - 1) * 2; + Rpp32s numBins = nfft / 2 + 1; + Rpp64f hzStep = static_cast(sampleRate) / nfft; + Rpp64f invHzStep = 1.0 / hzStep; + + // start and end bins for the Mel filter bank + Rpp32s fftBinStart = std::ceil(minFreq * invHzStep); + Rpp32s fftBinEnd = std::ceil(maxFreq * invHzStep); + fftBinEnd = std::min(fftBinEnd, numBins); + + // Initialize arrays used for Mel filter bank computation + std::fill(normFactors, normFactors + numFilter, 1.0f); + memset(weightsDown, 0, sizeof(srcDescPtr->h * sizeof(Rpp32f))); + std::fill(intervals, intervals + numFilter + 2, -1); + + // Compute Mel filter weights and intervals + Rpp32s fftBin = fftBinStart; + Rpp64f mel0 = melLow, mel1 = melLow + melStep; + Rpp64f fIter = fftBin * hzStep; + + intervals[0] = fftBinStart; + intervals[numFilter + 1] = fftBinEnd; + + for (int interval = 1, index = 0; index < numFilter + 1; interval++, index++, mel0 = mel1, mel1 += melStep) + { + Rpp64f f0 = melScalePtr->mel_to_hz(mel0); + Rpp64f f1 = melScalePtr->mel_to_hz(index == numFilter ? melHigh : mel1); + Rpp64f slope = 1.0 / (f1 - f0); + intervals[interval] = std::ceil(f1 / hzStep); + + if (normalize && index < numFilter) + { + Rpp64f f2 = melScalePtr->mel_to_hz(mel1 + melStep); + normFactors[index] = 2.0 / (f2 - f0); + } + + // Compute weights for each filter bank + for (; fftBin < fftBinEnd && fIter < f1; fftBin++, fIter = fftBin * hzStep) { + weightsDown[fftBin] = (f1 - fIter) * slope; + } + } + + Rpp32s globalThreads_x = dstDescPtr->w; // number of frequency bins (numBins) + Rpp32s globalThreads_y = dstDescPtr->h; // number of time frames + Rpp32s globalThreads_z = dstDescPtr->n; // batch size + hipLaunchKernelGGL(mel_filter_bank_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + srcDimsTensor, + numFilter, + normalize, + normFactors, + weightsDown, + intervals); + + delete melScalePtr; + return RPP_SUCCESS; +} diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index 7ab0bcccb..c6f45ace6 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -225,6 +225,12 @@ RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, { if (srcDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_SRC_LAYOUT; if (dstDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_DST_LAYOUT; + // Disabled this check for now. + // This check will be re-enabled when the numDims based changes are added in MIVisionX */ + // if (maxFreq < 0 || maxFreq > sampleRate / 2) + // return RPP_ERROR_INVALID_ARGUMENTS; + // if (minFreq < 0 || minFreq > sampleRate / 2) + // return RPP_ERROR_INVALID_ARGUMENTS; if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) { @@ -443,6 +449,57 @@ RppStatus rppt_pre_emphasis_filter_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** mel_filter_bank ********************/ + +RppStatus rppt_mel_filter_bank_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s* srcDimsTensor, + Rpp32f maxFreq, + Rpp32f minFreq, + RpptMelScaleFormula melFormula, + Rpp32s numFilter, + Rpp32f sampleRate, + bool normalize, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + Rpp32u tensorDims = srcDescPtr->numDims - 1; // exclude batchsize from input dims + if (tensorDims != 2) + return RPP_ERROR_INVALID_SRC_DIMS; + if (srcDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_SRC_LAYOUT; + if (dstDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_DST_LAYOUT; + if (maxFreq < 0 || maxFreq > sampleRate / 2) + return RPP_ERROR_INVALID_ARGUMENTS; + if (minFreq < 0 || minFreq > sampleRate / 2) + return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + return hip_exec_mel_filter_bank_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDimsTensor, + maxFreq, + minFreq, + melFormula, + numFilter, + sampleRate, + normalize, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + /******************** resample ********************/ RppStatus rppt_resample_gpu(RppPtr_t srcPtr, diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp index 11bb6d59b..0139849ba 100644 --- a/utilities/test_suite/HIP/Tensor_audio_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_audio_hip.cpp @@ -112,8 +112,16 @@ int main(int argc, char **argv) } set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes); // set buffer sizes for src/dst - iBufferSize = static_cast(srcDescPtr->h) * static_cast(srcDescPtr->w) * static_cast(srcDescPtr->c) * static_cast(srcDescPtr->n); - oBufferSize = static_cast(dstDescPtr->h) * static_cast(dstDescPtr->w) * static_cast(dstDescPtr->c) * static_cast(dstDescPtr->n); + if(testCase == 7) + { + iBufferSize = (Rpp64u)MEL_FILTER_BANK_MAX_HEIGHT * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n; + oBufferSize = (Rpp64u)MEL_FILTER_BANK_MAX_HEIGHT * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n; + } + else + { + iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n; + oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n; + } // compute maximum possible buffer size of resample unsigned long long resampleMaxBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride * 1.15; @@ -289,6 +297,32 @@ int main(int argc, char **argv) break; } + case 7: + { + testCaseName = "mel_filter_bank"; + + Rpp32f sampleRate = 16000; + Rpp32f minFreq = 0.0; + Rpp32f maxFreq = sampleRate / 2; + RpptMelScaleFormula melFormula = RpptMelScaleFormula::SLANEY; + Rpp32s numFilter = 80; + bool normalize = true; + srcDimsTensor[0] = 257; + srcDimsTensor[1] = 225; + srcDimsTensor[2] = 257; + srcDimsTensor[3] = 211; + srcDimsTensor[4] = 257; + srcDimsTensor[5] = 214; + + init_mel_filter_bank(&inputf32, &outputf32, srcDescPtr, dstDescPtr, dstDims, offsetInBytes, numFilter, batchSize, srcDimsTensor, scriptPath, testType); + + CHECK_RETURN_STATUS(hipMemcpy(d_inputf32, inputf32, iBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice)); + + startWallTime = omp_get_wtime(); + rppt_mel_filter_bank_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, srcDimsTensor, maxFreq, minFreq, melFormula, numFilter, sampleRate, normalize, handle); + + break; + } default: { missingFuncFlag = 1; diff --git a/utilities/test_suite/HIP/runAudioTests.py b/utilities/test_suite/HIP/runAudioTests.py index 82b4bb1ec..c64286276 100644 --- a/utilities/test_suite/HIP/runAudioTests.py +++ b/utilities/test_suite/HIP/runAudioTests.py @@ -36,8 +36,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 6 - +caseMax = 7 # Get a list of log files based on a flag for preserving output def get_log_file_list(): @@ -123,6 +122,7 @@ def rpp_test_suite_parser_and_validator(): parser.add_argument('--num_runs', type = int, default = 1, help = "Specifies the number of runs for running the performance tests") parser.add_argument('--preserve_output', type = int, default = 1, help = "preserves the output of the program - (0 = override output / 1 = preserve output )") parser.add_argument('--batch_size', type = int, default = 1, help = "Specifies the batch size to use for running tests. Default is 1.") + print_case_list(audioAugmentationMap, "HIP", parser) args = parser.parse_args() # check if the folder exists @@ -222,7 +222,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '3', '6'] +supportedCaseList = ['0', '1', '2', '3', '6', '7'] if qaMode and batchSize != 3: print("QA tests can only run with a batch size of 3.") exit(0) diff --git a/utilities/test_suite/HOST/Tensor_audio_host.cpp b/utilities/test_suite/HOST/Tensor_audio_host.cpp index 6981802f4..08200428a 100644 --- a/utilities/test_suite/HOST/Tensor_audio_host.cpp +++ b/utilities/test_suite/HOST/Tensor_audio_host.cpp @@ -125,9 +125,16 @@ int main(int argc, char **argv) descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1]; } - // set buffer sizes for src/dst - iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n; - oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n; + if(testCase == 7) + { + iBufferSize = (Rpp64u)MEL_FILTER_BANK_MAX_HEIGHT * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n; + oBufferSize = (Rpp64u)MEL_FILTER_BANK_MAX_HEIGHT * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n; + } + else + { + iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n; + oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n; + } // compute maximum possible buffer size of resample unsigned long long resampleMaxBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride * 1.15; @@ -364,41 +371,10 @@ int main(int argc, char **argv) RpptMelScaleFormula melFormula = RpptMelScaleFormula::SLANEY; Rpp32s numFilter = 80; bool normalize = true; - Rpp32s srcDimsTensor[] = {257, 225, 257, 211, 257, 214}; // (height, width) for each tensor in a batch for given QA inputs. - // Accepts outputs from FT layout of Spectrogram for QA - srcDescPtr->layout = dstDescPtr->layout = RpptLayout::NFT; - - maxDstHeight = 0; - maxDstWidth = 0; - maxSrcHeight = 0; - maxSrcWidth = 0; - for(int i = 0, j = 0; i < batchSize; i++, j += 2) - { - maxSrcHeight = std::max(maxSrcHeight, (int)srcDimsTensor[j]); - maxSrcWidth = std::max(maxSrcWidth, (int)srcDimsTensor[j + 1]); - dstDims[i].height = numFilter; - dstDims[i].width = srcDimsTensor[j + 1]; - maxDstHeight = std::max(maxDstHeight, (int)dstDims[i].height); - maxDstWidth = std::max(maxDstWidth, (int)dstDims[i].width); - } - srcDescPtr->h = maxSrcHeight; - srcDescPtr->w = maxSrcWidth; - dstDescPtr->h = maxDstHeight; - dstDescPtr->w = maxDstWidth; - - set_audio_descriptor_dims_and_strides_nostriding(srcDescPtr, batchSize, maxSrcHeight, maxSrcWidth, maxSrcChannels, offsetInBytes); - set_audio_descriptor_dims_and_strides_nostriding(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes); - srcDescPtr->numDims = 3; - dstDescPtr->numDims = 3; - - // Set buffer sizes for src/dst - unsigned long long spectrogramBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)srcDescPtr->c * (unsigned long long)srcDescPtr->n; - unsigned long long melFilterBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)dstDescPtr->c * (unsigned long long)dstDescPtr->n; - inputf32 = (Rpp32f *)realloc(inputf32, spectrogramBufferSize * sizeof(Rpp32f)); - outputf32 = (Rpp32f *)realloc(outputf32, melFilterBufferSize * sizeof(Rpp32f)); + // (height, width) for each tensor in a batch for given QA inputs. + Rpp32s srcDimsTensor[] = {257, 225, 257, 211, 257, 214}; - // Read source data - read_from_bin_file(inputf32, srcDescPtr, srcDimsTensor, "spectrogram", scriptPath); + init_mel_filter_bank(&inputf32, &outputf32, srcDescPtr, dstDescPtr, dstDims, offsetInBytes, numFilter, batchSize, srcDimsTensor, scriptPath, testType); startWallTime = omp_get_wtime(); rppt_mel_filter_bank_host(inputf32, srcDescPtr, outputf32, dstDescPtr, srcDimsTensor, maxFreq, minFreq, melFormula, numFilter, sampleRate, normalize, handle); diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py index e24ee73f6..31769a768 100644 --- a/utilities/test_suite/common.py +++ b/utilities/test_suite/common.py @@ -93,7 +93,7 @@ 4: ["spectrogram", "HOST"], 5: ["slice", "HOST"], 6: ["resample", "HOST", "HIP"], - 7: ["mel_filter_bank", "HOST"] + 7: ["mel_filter_bank", "HOST", "HIP"] } voxelAugmentationMap = { diff --git a/utilities/test_suite/rpp_test_suite_audio.h b/utilities/test_suite/rpp_test_suite_audio.h index de1ebff9a..5291a6bfa 100644 --- a/utilities/test_suite/rpp_test_suite_audio.h +++ b/utilities/test_suite/rpp_test_suite_audio.h @@ -35,6 +35,8 @@ SOFTWARE. #include using namespace std; +#define MEL_FILTER_BANK_MAX_HEIGHT 257 // Maximum height for mel filter bank set to 257 to ensure compatibility with test configuration + std::map audioAugmentationMap = { {0, "non_silent_region_detection"}, @@ -55,6 +57,18 @@ std::map> NonSilentRegionReferenceOutputs = {"sample3", {0, 34160}} }; +// Cutoff values for audio HIP kernels +std::map audioHIPCutOff = +{ + {"to_decibels", 1e-6}, + {"pre_emphasis_filter", 1e-6}, + {"down_mixing", 1e-6}, + {"spectrogram", 1e-3}, + {"slice", 1e-20}, + {"resample", 1e-6}, + {"mel_filter_bank", 1e-5} +}; + // sets descriptor dimensions and strides of src/dst inline void set_audio_descriptor_dims_and_strides(RpptDescPtr descPtr, int batchSize, int maxHeight, int maxWidth, int maxChannels, int offsetInBytes) { @@ -114,6 +128,7 @@ inline void set_audio_max_dimensions(vector audioFilesPath, int& maxWidt } } +// Read a batch of audio samples and fill dims void read_audio_batch_and_fill_dims(RpptDescPtr descPtr, Rpp32f *inputf32, vector audioFilesPath, int iterCount, Rpp32s *srcLengthTensor, Rpp32s *channelsTensor) { auto fileIndex = iterCount * descPtr->n; @@ -151,10 +166,10 @@ void read_audio_batch_and_fill_dims(RpptDescPtr descPtr, Rpp32f *inputf32, vecto } } -void read_from_bin_file(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcDims, string testCase, string scriptPath) +void read_from_bin_file(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcDims, string testCase, string scriptPath, int numSamples) { // read data from golden outputs - Rpp64u oBufferSize = srcDescPtr->n * srcDescPtr->strides.nStride; + Rpp64u oBufferSize = numSamples * srcDescPtr->strides.nStride; Rpp32f *refInput = static_cast(malloc(oBufferSize * sizeof(float))); string outFile = scriptPath + "/../REFERENCE_OUTPUTS_AUDIO/" + testCase + "/" + testCase + ".bin"; std::fstream fin(outFile, std::ios::in | std::ios::binary); @@ -176,7 +191,7 @@ void read_from_bin_file(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcDims, std::cout<<"\nCould not open the reference output. Please check the path specified\n"; return; } - for (int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + for (int batchCount = 0; batchCount < numSamples; batchCount++) { Rpp32f *srcPtrCurrent = srcPtr + batchCount * srcDescPtr->strides.nStride; Rpp32f *refPtrCurrent = refInput + batchCount * srcDescPtr->strides.nStride; @@ -195,6 +210,34 @@ void read_from_bin_file(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcDims, free(refInput); } +//replicate the last sample buffer for the remaining samples +void replicate_last_sample_mel_filter_bank(Rpp32f *srcPtr, int numSamples, unsigned long sampleSize, int batchSize) +{ + if (batchSize <= numSamples) + return; + + Rpp32f *lastSample = srcPtr + (numSamples - 1) * sampleSize; + for (int i = numSamples; i < batchSize; i++) + { + Rpp32f *sample = srcPtr + i * sampleSize; + memcpy(sample, lastSample, sampleSize * sizeof(Rpp32f)); + } +} + +// Replicate the dimensions of the last sample to fill the remaining batch samples. +void replicate_src_dims_to_fill_batch(Rpp32s *srcDimsTensor, int numSamples, int batchSize) +{ + if (batchSize <= numSamples) + return; + + for (int i = numSamples; i < batchSize; i++) + { + srcDimsTensor[i * 2] = srcDimsTensor[(numSamples - 1) * 2]; + srcDimsTensor[i * 2 + 1] = srcDimsTensor[(numSamples - 1) * 2 + 1]; + } +} + +// Compares output with reference outputs and validates QA void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dstDims, string testCase, string dst, string scriptPath, string backend) { fstream refFile; @@ -223,7 +266,7 @@ void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dst std::cout<<"\nCould not open the reference output. Please check the path specified\n"; return; } - double cutoff = (backend == "HOST") ? 1e-20 : 1e-6; + double cutoff = (backend == "HOST") ? 1e-20 : audioHIPCutOff[testCase]; // iterate over all samples in a batch and compare with reference outputs for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) @@ -249,6 +292,8 @@ void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dst bool invalidComparision = ((outVal == 0.0f) && (refVal != 0.0f)); if (!invalidComparision && abs(outVal - refVal) < cutoff) matchedIndices += 1; + else + std::cout<<"\n mismatch "<<" row "< audioNames, string dst) { int fileMatch = 0; @@ -362,5 +408,46 @@ inline void windowed_sinc(RpptResamplingWindow &window, Rpp32s coeffs, Rpp32s lo window.center = center + 1; window.scale = 1 / scale; window.pCenter = _mm_set1_ps(window.center); - window.pScale = _mm_set1_ps(window.scale); -} \ No newline at end of file + window.pScale = _mm_set1_ps(window.scale); +} + +// Mel filter bank initializer for unit and performance testing +void inline init_mel_filter_bank(Rpp32f **inputf32, Rpp32f **outputf32, RpptDescPtr srcDescPtr, RpptDescPtr dstDescPtr, RpptImagePatch *dstDims, Rpp32u offsetInBytes, Rpp32s numFilter, int batchSize, Rpp32s *srcDimsTensor, string scriptPath, int testType) +{ + // Accepts outputs from FT layout of Spectrogram for QA + srcDescPtr->layout = dstDescPtr->layout = RpptLayout::NFT; + + int maxDstHeight = 0; + int maxDstWidth = 0; + int maxSrcHeight = 0; + int maxSrcWidth = 0; + int numSamples = 3; + for(int i = 0, j = 0; i < numSamples; i++, j += 2) + { + maxSrcHeight = std::max(maxSrcHeight, (int)srcDimsTensor[j]); + maxSrcWidth = std::max(maxSrcWidth, (int)srcDimsTensor[j + 1]); + dstDims[i].height = numFilter; + dstDims[i].width = srcDimsTensor[j + 1]; + maxDstHeight = std::max(maxDstHeight, (int)dstDims[i].height); + maxDstWidth = std::max(maxDstWidth, (int)dstDims[i].width); + } + srcDescPtr->h = maxSrcHeight; + srcDescPtr->w = maxSrcWidth; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + + set_audio_descriptor_dims_and_strides_nostriding(srcDescPtr, batchSize, maxSrcHeight, maxSrcWidth, 1, offsetInBytes); + set_audio_descriptor_dims_and_strides_nostriding(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, 1, offsetInBytes); + srcDescPtr->numDims = 3; + dstDescPtr->numDims = 3; + + unsigned long sampleSize = static_cast(srcDescPtr->h) * static_cast(srcDescPtr->w) * static_cast(srcDescPtr->c); + + // Read source data + read_from_bin_file(*inputf32, srcDescPtr, srcDimsTensor, "spectrogram", scriptPath, numSamples); + if(testType) + { + replicate_last_sample_mel_filter_bank(*inputf32, numSamples, sampleSize, batchSize); + replicate_src_dims_to_fill_batch(srcDimsTensor, numSamples, batchSize); + } +}