From a8862adfad66d86bdb4bdec3c4c7206f57803882 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Fri, 10 Jan 2025 08:18:15 +0000 Subject: [PATCH 01/14] [GPU]: Added temporary benchmark. --- .../unit/test_cases/stft_gpu_benchamrk.cpp | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp new file mode 100644 index 00000000000000..a3e70194fe0a90 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "test_utils.h" + +using namespace cldnn; +using namespace ::tests; + +// PK: TEMPORARY BENCHMARK, WILL BE REMOVED BEFORE MERGING TO MASTER. + +namespace { + +namespace helpers { +// TODO: Move to common place. + +// Converts float vector to another type vector. +template +std::vector ConverFloatVector(const std::vector& vec) { + std::vector ret; + ret.reserve(vec.size()); + for (const auto& val : vec) { + ret.push_back(T(val)); + } + return ret; +} + +// Allocates tensoer with given shape and data. +template +memory::ptr AllocateTensor(ov::PartialShape shape, const std::vector& data) { + const layout lo = {shape, ov::element::from(), cldnn::format::bfyx}; + EXPECT_EQ(lo.get_linear_size(), data.size()); + memory::ptr tensor = get_test_engine().allocate_memory(lo); + set_values(tensor, data); + return tensor; +} + +} // namespace helpers + +struct STFTTestParams { + ov::PartialShape signalShape; + ov::PartialShape windowShape; + ov::PartialShape outputShape; + int64_t frameSize; + int64_t frameStep; + bool transposedFrames; + std::vector signalData; + std::vector windowData; + std::vector expectedOutput; + std::string testcaseName; +}; + +class stft_benchmark : public ::testing::Test { +public: + struct STFTInferenceParams { + bool transposedFrames; + memory::ptr signal; + memory::ptr window; + memory::ptr frameSize; + memory::ptr frameStep; + memory::ptr expectedOutput; + }; + + template + STFTInferenceParams PrepareInferenceParams(const STFTTestParams& testParam) { + using T = typename ov::element_type_traits::value_type; + STFTInferenceParams ret; + + ret.transposedFrames = testParam.transposedFrames; + + ret.signal = + helpers::AllocateTensor(testParam.signalShape, helpers::ConverFloatVector(testParam.signalData)); + ret.window = + helpers::AllocateTensor(testParam.windowShape, helpers::ConverFloatVector(testParam.windowData)); + ret.frameStep = helpers::AllocateTensor({}, {testParam.frameStep}); + ret.frameSize = helpers::AllocateTensor({}, {testParam.frameSize}); + + return ret; + } + + void Execute(const STFTInferenceParams& params) { + // Prepare the network. + + auto scalar_layout = params.frameSize->get_layout(); + scalar_layout.set_partial_shape({}); + + topology topology; + topology.add(input_layout("signal", params.signal->get_layout())); + topology.add(input_layout("window", params.window->get_layout())); + topology.add(input_layout("frameSize", scalar_layout)); + topology.add(input_layout("frameStep", scalar_layout)); + topology.add(STFT("stft", + input_info("signal"), + input_info("window"), + input_info("frameSize"), + input_info("frameStep"), + params.transposedFrames)); + + auto stream = get_test_stream_ptr(get_test_default_config(engine_)); + cldnn::network::ptr network = get_network(engine_, topology, get_test_default_config(engine_), stream, false); + + network->set_input_data("signal", params.signal); + network->set_input_data("window", params.window); + network->set_input_data("frameSize", params.frameSize); + network->set_input_data("frameStep", params.frameStep); + + // Run and check results. + const int warmup = 10; + const int run = 100; + + std::map outputs; + for (int i = 0; i < warmup; ++i) + outputs = network->execute(); + network->reset_execution(true); + + // Note: Should be based on events, this one + // also adds up kernel launch time and gpu idle time. + auto start = std::chrono::system_clock::now(); + for (int i = 0; i < run; ++i) + outputs = network->execute(); + network->reset_execution(true); + auto stop = std::chrono::system_clock::now(); + + const auto d_actual = std::chrono::duration_cast(stop - start).count(); + auto output = outputs.at("stft").get_memory(); + auto outputShape = output->get_layout().get_shape(); + std::cout << "Avg Time for output shape " << outputShape << ":" << d_actual / run << " microseconds\n\n"; + } + + template + void RunBenchmark(const ov::PartialShape& signalShape, int frameSize, int frameStep, bool transposed) { + std::cout << "Benchmark: signal shape: " << signalShape << ", frameSize: " << frameSize + << ", frameStep: " << frameStep << ", transposed: " << transposed << std::endl; + struct STFTTestParams params; + params.signalShape = signalShape; + params.windowShape = {frameSize}; + params.frameSize = frameSize; + params.frameStep = frameStep; + params.transposedFrames = transposed; + params.signalData = std::vector(ov::shape_size(params.signalShape.get_shape()), 0); + params.windowData = std::vector(ov::shape_size(params.windowShape.get_shape()), 0); + params.testcaseName = ""; + + Execute(PrepareInferenceParams(params)); + } + +private: + engine& engine_ = get_test_engine(); +}; +} // namespace + +TEST_F(stft_benchmark, DISABLED_benchmarks) { + RunBenchmark({10000}, 1000, 2, true); + RunBenchmark({10000}, 1000, 2, false); + + RunBenchmark({32768}, 2048, 512, true); + RunBenchmark({32768}, 2048, 512, false); + + RunBenchmark({10000}, 100, 2, true); + RunBenchmark({10000}, 100, 2, false); + + RunBenchmark({10000}, 1000, 200, true); + RunBenchmark({10000}, 1000, 200, false); +} From c8fc8d630790d7db19f0c3f14168d4ce0b1b2c5f Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Fri, 10 Jan 2025 13:48:31 +0000 Subject: [PATCH 02/14] [GPU]: Added stub for stft_opt kernel. --- .../kernel_selector/cl_kernels/stft_opt.cl | 93 +++++++++++++++++++ .../kernels/stft/stft_kernel_base.cpp | 8 +- .../kernels/stft/stft_kernel_base.h | 4 +- .../kernels/stft/stft_kernel_opt.cpp | 60 ++++++++++++ .../kernels/stft/stft_kernel_opt.h | 19 ++++ .../kernels/stft/stft_kernel_selector.cpp | 2 + 6 files changed, 179 insertions(+), 7 deletions(-) create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl new file mode 100644 index 00000000000000..9ab2d6838d1725 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// alternative: https://github.com/OpenCL/ComplexMath/blob/master/clcomplex.h +typedef float2 cfloat; +#define real(a) ((a).s0) +#define imag(a) ((a).s1) +#define crmult(a, b) ((cfloat)(real(a) * (b), imag(a) * (b))) +#define cadd(a, b) ((cfloat)(real(a) + real(b), imag(a) + imag(b))) +#define csub(a, b) ((cfloat)(real(a) - real(b), imag(a) - imag(b))) +#define expmi(x) ((cfloat)(cos(x), -sin(x))) +#define czero() ((cfloat)(0)) + +// Unoptimized, the most obvious stft impl from the definition. +KERNEL(stft_ref)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* restrict signal, + const __global INPUT1_TYPE* restrict window, + const __global INPUT2_TYPE* restrict frame_size_buff, + const __global INPUT3_TYPE* restrict frame_step_buff, + __global OUTPUT_TYPE* restrict output) +{ + + const int freq_id = get_global_id(0); + const int frame_id = get_global_id(1); + const int batch = get_global_id(2); + const int frame_size = (int)frame_size_buff[0]; + const int frame_step = (int)frame_step_buff[0]; + const int window_size = INPUT1_SIZE_X; + + __local float signal_for_this_frame_shared[window_size]; + + const size_t block_size = get_local_size(0)*get_local_size(1)*get_local_size(2); + +// if(freq_id == 0 && frame_id == 0 && batch == 0) { +// printf("Printing from thread 0!\n"); +// printf("BlockSize: %i\n", get_local_size(0)*get_local_size(1)*get_local_size(2)); +// printf("Blocks: %i\n", get_num_groups(0)*get_num_groups(1)*get_num_groups(2)); + +// #if TRANSPOSE_FRAMES +// const int FREQS = OUTPUT_FEATURE_NUM; +// #else +// const int FREQS = OUTPUT_SIZE_Y; +// #endif +// printf("FREQS: %i\n", FREQS); +// } + + + // Handling case where window size is smaller than frame size. + const int start_offset = (frame_size - window_size) / 2; + + const INPUT0_TYPE* restrict signal_for_this_frame = signal + batch*INPUT0_SIZE_X + frame_id*frame_step + start_offset; + + // Preload into shared mem: + for( size_t i = get_local_linear_id(); i < window_size; i+= block_size) { + + signal_for_this_frame_shared[i] = (float)signal_for_this_frame[i]; + } + + // FT from def for single freq for given frame: + cfloat freq_val = czero(); + + // dft_power = 2*PI*(k/N) from dft def. + const float dft_power = 2.0f * M_PI_F * (float)freq_id / (float)frame_size; + + cfloat err = czero(); + for(int i = 0; i < window_size; ++i) { + const float signal_val = signal_for_this_frame_shared[i]; + const float window_val = (float)window[i]; + const float x_i = signal_val*window_val; + const cfloat e_i = expmi(dft_power*(float)(i+start_offset)); + const cfloat val_i = crmult(e_i, x_i); + + // Kahan sum algo: + const cfloat y = csub(val_i, err); + const cfloat newSum = cadd(freq_val, y); + err = csub(newSum, freq_val); + err = csub(err, y); + freq_val = newSum; + } + +#if TRANSPOSE_FRAMES + const int output_real_idx = OUTPUT_GET_INDEX(batch, freq_id, frame_id, 0); + const int output_imag_idx = OUTPUT_GET_INDEX(batch, freq_id, frame_id, 1); +#else + const int output_real_idx = OUTPUT_GET_INDEX(batch, frame_id, freq_id, 0); + const int output_imag_idx = OUTPUT_GET_INDEX(batch, frame_id, freq_id, 1); +#endif + + output[output_real_idx] = (OUTPUT_TYPE)real(freq_val); + output[output_imag_idx] = (OUTPUT_TYPE)imag(freq_val); +} \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp index 8eb8ce36c14f2f..dd5cc745bff473 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp @@ -18,9 +18,9 @@ JitConstants STFTKernelBase::GetJitConstants(const STFT_params& params) const { } void STFTKernelBase::GetUpdateDispatchDataFunc(KernelData& kd) const { - kd.update_dispatch_data_func = [](const Params& params, KernelData& kd) { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { const auto& prim_params = static_cast(params); - auto dispatchData = SetDefault(prim_params); + auto dispatchData = CalcLaunchConfig(prim_params); OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); kd.kernels[0].params.workGroups.global = dispatchData.gws; kd.kernels[0].params.workGroups.local = dispatchData.lws; @@ -28,7 +28,7 @@ void STFTKernelBase::GetUpdateDispatchDataFunc(KernelData& kd) const { }; } -STFTKernelBase::DispatchData STFTKernelBase::SetDefault(const STFT_params& params) { +CommonDispatchData STFTKernelBase::CalcLaunchConfig(const STFT_params& params) const { CommonDispatchData dispatchData; const auto inLayout = params.inputs.front().GetLayout(); const auto& output = params.outputs.front(); @@ -61,7 +61,7 @@ KernelsData STFTKernelBase::GetCommonKernelsData(const Params& params) const { const auto& prim_params = static_cast(params); - auto dispatchData = SetDefault(prim_params); + auto dispatchData = CalcLaunchConfig(prim_params); KernelData k_data = KernelData::Default(params); auto cldnn_jit = GetJitConstants(prim_params); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h index 75ad08280e6c74..e5c94526a8aedb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h @@ -23,11 +23,9 @@ class STFTKernelBase : public KernelBaseOpenCL { public: using KernelBaseOpenCL::KernelBaseOpenCL; - using DispatchData = CommonDispatchData; - protected: JitConstants GetJitConstants(const STFT_params& params) const; - static DispatchData SetDefault(const STFT_params& params); + virtual CommonDispatchData CalcLaunchConfig(const STFT_params& params) const; KernelsData GetCommonKernelsData(const Params& params) const; void GetUpdateDispatchDataFunc(KernelData& kd) const override; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp new file mode 100644 index 00000000000000..ecb08f74dd1ea2 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "stft_kernel_opt.h" + +namespace kernel_selector { +ParamsKey STFTKernelOpt::GetSupportedKey() const { + ParamsKey k; + + k.EnableInputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::INT64); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::F16); + + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + + k.EnableInputLayout(DataLayout::bfyx); + + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableBatching(); + k.EnableDifferentTypes(); + k.EnableDynamicShapesSupport(); + return k; +} + +KernelsData STFTKernelOpt::GetKernelsData(const Params& params) const { + return GetCommonKernelsData(params); +} + +KernelsPriority STFTKernelOpt::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_8; +} + +CommonDispatchData STFTKernelOpt::CalcLaunchConfig(const STFT_params& params) const { + CommonDispatchData dispatchData; + const auto& output = params.outputs.front(); + + OPENVINO_ASSERT(output.Dimentions() == 4); + OPENVINO_ASSERT(output.X().v == 2); + + std::vector> dimsByGws; + + if (params.transpose_frames) { + dispatchData.gws = {output.Feature().v, output.Y().v, output.Batch().v}; + } else { + dispatchData.gws = {output.Y().v, output.Feature().v, output.Batch().v}; + } + + const size_t threads = dispatchData.gws[0] < 32 ? dispatchData.gws[0] : 32; + + dispatchData.lws = {threads, 1, 1}; + + //std::cout << dispatchData << std::endl; + return dispatchData; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h new file mode 100644 index 00000000000000..e338fd685e6cf4 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "stft_kernel_base.h" + +namespace kernel_selector { +class STFTKernelOpt : public STFTKernelBase { +public: + STFTKernelOpt() : STFTKernelBase("stft_opt") {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + CommonDispatchData CalcLaunchConfig(const STFT_params& params) const override; + ParamsKey GetSupportedKey() const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_selector.cpp index 02edc108c2e680..7826fafd217260 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_selector.cpp @@ -4,11 +4,13 @@ #include "stft_kernel_selector.h" +#include "stft_kernel_opt.h" #include "stft_kernel_ref.h" namespace kernel_selector { STFT_kernel_selector::STFT_kernel_selector() { Attach(); + Attach(); } KernelsData STFT_kernel_selector::GetBestKernels(const Params& params) const { From 3cd11f293703d52c1b80726b41234fe3e93d3ca5 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Tue, 14 Jan 2025 12:06:56 +0000 Subject: [PATCH 03/14] [gpu]: stft_opt: Optimized block size plus caching x_i in shared mem. --- .../kernel_selector/cl_kernels/stft_opt.cl | 35 +++++++++++-------- .../kernels/stft/stft_kernel_opt.cpp | 3 +- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl index 9ab2d6838d1725..e2d194d838550e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl @@ -29,10 +29,13 @@ KERNEL(stft_ref)( const int frame_step = (int)frame_step_buff[0]; const int window_size = INPUT1_SIZE_X; - __local float signal_for_this_frame_shared[window_size]; + __local float x_i_shared[window_size]; const size_t block_size = get_local_size(0)*get_local_size(1)*get_local_size(2); + // const int bla = 1; + // const int test = sub_group_reduce_add(bla); + // if(freq_id == 0 && frame_id == 0 && batch == 0) { // printf("Printing from thread 0!\n"); // printf("BlockSize: %i\n", get_local_size(0)*get_local_size(1)*get_local_size(2)); @@ -44,20 +47,31 @@ KERNEL(stft_ref)( // const int FREQS = OUTPUT_SIZE_Y; // #endif // printf("FREQS: %i\n", FREQS); +// printf("get_sub_group_size(): %i\n", get_sub_group_size()); +// printf("get_num_sub_groups(): %i\n", get_num_sub_groups()); +// printf("test: %i\n", test); // } + // if(get_local_linear_id() == 0) { + // printf("get_group_id(0): %i \n", get_group_id(0)); + // printf("get_group_id(1): %i \n", get_group_id(1)); + // printf("get_group_id(2): %i \n", get_group_id(2)); + // } // Handling case where window size is smaller than frame size. const int start_offset = (frame_size - window_size) / 2; - const INPUT0_TYPE* restrict signal_for_this_frame = signal + batch*INPUT0_SIZE_X + frame_id*frame_step + start_offset; + const INPUT0_TYPE* restrict signal_for_this_frame = signal + batch*INPUT0_SIZE_X + frame_id*frame_step + start_offset; // Preload into shared mem: - for( size_t i = get_local_linear_id(); i < window_size; i+= block_size) { - - signal_for_this_frame_shared[i] = (float)signal_for_this_frame[i]; + for(size_t i = get_local_linear_id(); i < window_size; i+= block_size) { + const float signal_val = (float)signal_for_this_frame[i]; + const float window_val = (float)window[i]; + x_i_shared[i] = signal_val*window_val; } + barrier(CLK_LOCAL_MEM_FENCE); + // FT from def for single freq for given frame: cfloat freq_val = czero(); @@ -66,18 +80,11 @@ KERNEL(stft_ref)( cfloat err = czero(); for(int i = 0; i < window_size; ++i) { - const float signal_val = signal_for_this_frame_shared[i]; - const float window_val = (float)window[i]; - const float x_i = signal_val*window_val; + const float x_i = x_i_shared[i]; const cfloat e_i = expmi(dft_power*(float)(i+start_offset)); const cfloat val_i = crmult(e_i, x_i); - // Kahan sum algo: - const cfloat y = csub(val_i, err); - const cfloat newSum = cadd(freq_val, y); - err = csub(newSum, freq_val); - err = csub(err, y); - freq_val = newSum; + freq_val = cadd(freq_val, val_i); } #if TRANSPOSE_FRAMES diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp index ecb08f74dd1ea2..b46a51f366f33a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp @@ -49,7 +49,8 @@ CommonDispatchData STFTKernelOpt::CalcLaunchConfig(const STFT_params& params) co dispatchData.gws = {output.Y().v, output.Feature().v, output.Batch().v}; } - const size_t threads = dispatchData.gws[0] < 32 ? dispatchData.gws[0] : 32; + const int wantedThreadsPerBlock = 128; + const size_t threads = dispatchData.gws[0] < wantedThreadsPerBlock ? dispatchData.gws[0] : wantedThreadsPerBlock; dispatchData.lws = {threads, 1, 1}; From 6a12df0892406a6c22b410577365716aead842c4 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Tue, 14 Jan 2025 14:10:17 +0000 Subject: [PATCH 04/14] [GPU]: Enabled stft benchmarks, --- .../tests/unit/test_cases/stft_gpu_benchamrk.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp index a3e70194fe0a90..21b8daf295f40e 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp @@ -16,6 +16,9 @@ using namespace ::tests; namespace { +const int WARMUPS = 10; +const int RUNS = 100; + namespace helpers { // TODO: Move to common place. @@ -110,8 +113,8 @@ class stft_benchmark : public ::testing::Test { network->set_input_data("frameStep", params.frameStep); // Run and check results. - const int warmup = 10; - const int run = 100; + const int warmup = WARMUPS; + const int run = RUNS; std::map outputs; for (int i = 0; i < warmup; ++i) @@ -154,7 +157,7 @@ class stft_benchmark : public ::testing::Test { }; } // namespace -TEST_F(stft_benchmark, DISABLED_benchmarks) { +TEST_F(stft_benchmark, benchmarks) { RunBenchmark({10000}, 1000, 2, true); RunBenchmark({10000}, 1000, 2, false); From 5d0c2f5c695e2429db1909f95cdd6c3773df6080 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Thu, 16 Jan 2025 12:20:29 +0000 Subject: [PATCH 05/14] [gpu]: stft becnhmark: output handling fix. --- .../intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp index 21b8daf295f40e..6ef1a97b2320b3 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp @@ -118,18 +118,20 @@ class stft_benchmark : public ::testing::Test { std::map outputs; for (int i = 0; i < warmup; ++i) - outputs = network->execute(); + network->execute(); network->reset_execution(true); // Note: Should be based on events, this one // also adds up kernel launch time and gpu idle time. auto start = std::chrono::system_clock::now(); for (int i = 0; i < run; ++i) - outputs = network->execute(); + network->execute(); network->reset_execution(true); auto stop = std::chrono::system_clock::now(); const auto d_actual = std::chrono::duration_cast(stop - start).count(); + + outputs = network->execute(); auto output = outputs.at("stft").get_memory(); auto outputShape = output->get_layout().get_shape(); std::cout << "Avg Time for output shape " << outputShape << ":" << d_actual / run << " microseconds\n\n"; From f11c8848b95597dbd55fc50f6993251212b9a823 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Thu, 16 Jan 2025 12:30:47 +0000 Subject: [PATCH 06/14] [gpu]: stft_opt: using fast trigonometric functions plus made local buff static. --- .../intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl index e2d194d838550e..175b95d6745ba5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl @@ -9,9 +9,11 @@ typedef float2 cfloat; #define crmult(a, b) ((cfloat)(real(a) * (b), imag(a) * (b))) #define cadd(a, b) ((cfloat)(real(a) + real(b), imag(a) + imag(b))) #define csub(a, b) ((cfloat)(real(a) - real(b), imag(a) - imag(b))) -#define expmi(x) ((cfloat)(cos(x), -sin(x))) +#define expmi(x) ((cfloat)(native_cos(x), -native_sin(x))) #define czero() ((cfloat)(0)) +#define X_I_MAX_BUFFER_SIZE 2048 + // Unoptimized, the most obvious stft impl from the definition. KERNEL(stft_ref)( OPTIONAL_SHAPE_INFO_ARG @@ -29,7 +31,7 @@ KERNEL(stft_ref)( const int frame_step = (int)frame_step_buff[0]; const int window_size = INPUT1_SIZE_X; - __local float x_i_shared[window_size]; + __local float x_i_shared[X_I_MAX_BUFFER_SIZE]; const size_t block_size = get_local_size(0)*get_local_size(1)*get_local_size(2); From d59d10cb941e037c5718290dfa6e081d458daaa3 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Mon, 20 Jan 2025 13:52:41 +0000 Subject: [PATCH 07/14] [gpu]: stft: optimization, now each sub_group produces output for 4 freq at the same time. --- .../kernel_selector/cl_kernels/stft_opt.cl | 120 +++++++++--------- .../kernels/stft/stft_kernel_base.h | 2 +- .../kernels/stft/stft_kernel_opt.cpp | 31 +++-- .../kernels/stft/stft_kernel_opt.h | 1 + 4 files changed, 80 insertions(+), 74 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl index 175b95d6745ba5..87fe5696f6d646 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl @@ -2,19 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // -// alternative: https://github.com/OpenCL/ComplexMath/blob/master/clcomplex.h -typedef float2 cfloat; -#define real(a) ((a).s0) -#define imag(a) ((a).s1) -#define crmult(a, b) ((cfloat)(real(a) * (b), imag(a) * (b))) -#define cadd(a, b) ((cfloat)(real(a) + real(b), imag(a) + imag(b))) -#define csub(a, b) ((cfloat)(real(a) - real(b), imag(a) - imag(b))) -#define expmi(x) ((cfloat)(native_cos(x), -native_sin(x))) -#define czero() ((cfloat)(0)) - -#define X_I_MAX_BUFFER_SIZE 2048 - -// Unoptimized, the most obvious stft impl from the definition. + +#define FREQS_PER_THREAD 4 + KERNEL(stft_ref)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* restrict signal, @@ -23,43 +13,24 @@ KERNEL(stft_ref)( const __global INPUT3_TYPE* restrict frame_step_buff, __global OUTPUT_TYPE* restrict output) { +#if TRANSPOSE_FRAMES + const size_t FREQS = OUTPUT_FEATURE_NUM; +#else + const size_t FREQS = OUTPUT_SIZE_Y; +#endif - const int freq_id = get_global_id(0); - const int frame_id = get_global_id(1); - const int batch = get_global_id(2); - const int frame_size = (int)frame_size_buff[0]; - const int frame_step = (int)frame_step_buff[0]; - const int window_size = INPUT1_SIZE_X; + const size_t blocksPerFreq = (FREQS + FREQ_PER_BLOCK-1)/FREQ_PER_BLOCK; + const size_t batch = get_global_id(0); + const size_t frame_id = get_group_id(1)/blocksPerFreq; + const size_t freq_start = (get_group_id(1)%blocksPerFreq)*FREQ_PER_BLOCK; + const size_t frame_size = (size_t)frame_size_buff[0]; + const size_t frame_step = (size_t)frame_step_buff[0]; + const size_t window_size = INPUT1_SIZE_X; __local float x_i_shared[X_I_MAX_BUFFER_SIZE]; const size_t block_size = get_local_size(0)*get_local_size(1)*get_local_size(2); - // const int bla = 1; - // const int test = sub_group_reduce_add(bla); - -// if(freq_id == 0 && frame_id == 0 && batch == 0) { -// printf("Printing from thread 0!\n"); -// printf("BlockSize: %i\n", get_local_size(0)*get_local_size(1)*get_local_size(2)); -// printf("Blocks: %i\n", get_num_groups(0)*get_num_groups(1)*get_num_groups(2)); - -// #if TRANSPOSE_FRAMES -// const int FREQS = OUTPUT_FEATURE_NUM; -// #else -// const int FREQS = OUTPUT_SIZE_Y; -// #endif -// printf("FREQS: %i\n", FREQS); -// printf("get_sub_group_size(): %i\n", get_sub_group_size()); -// printf("get_num_sub_groups(): %i\n", get_num_sub_groups()); -// printf("test: %i\n", test); -// } - - // if(get_local_linear_id() == 0) { - // printf("get_group_id(0): %i \n", get_group_id(0)); - // printf("get_group_id(1): %i \n", get_group_id(1)); - // printf("get_group_id(2): %i \n", get_group_id(2)); - // } - // Handling case where window size is smaller than frame size. const int start_offset = (frame_size - window_size) / 2; @@ -74,29 +45,54 @@ KERNEL(stft_ref)( barrier(CLK_LOCAL_MEM_FENCE); - // FT from def for single freq for given frame: - cfloat freq_val = czero(); + const size_t max_freq_for_this_block = min(freq_start + FREQ_PER_BLOCK, FREQS); - // dft_power = 2*PI*(k/N) from dft def. - const float dft_power = 2.0f * M_PI_F * (float)freq_id / (float)frame_size; + // Currently each sub group calcs 4 freq_id at the same time + for(size_t freq_id = get_sub_group_id()*FREQS_PER_THREAD + freq_start; freq_id < max_freq_for_this_block; freq_id += get_num_sub_groups()*FREQS_PER_THREAD) { - cfloat err = czero(); - for(int i = 0; i < window_size; ++i) { - const float x_i = x_i_shared[i]; - const cfloat e_i = expmi(dft_power*(float)(i+start_offset)); - const cfloat val_i = crmult(e_i, x_i); + float4 freq_val_real = 0.0f; + float4 freq_val_img = 0.0f; - freq_val = cadd(freq_val, val_i); - } + // // dft_power = 2*PI*(k/N) from dft def. + float4 dft_power = 2.0f * M_PI_F / (float)frame_size; + dft_power.s0 *= (float)(freq_id + 0); + dft_power.s1 *= (float)(freq_id + 1); + dft_power.s2 *= (float)(freq_id + 2); + dft_power.s3 *= (float)(freq_id + 3); + + // sin cos bound(?): Probably there is some external unit to calc sin cos + // which is overloaded with commands(each thread issues 8 such instructions) + // TODO: Implement fft. + for(int i = get_sub_group_local_id(); i < window_size; i+= get_sub_group_size()) { + const float x_i = x_i_shared[i]; + const float4 real = native_cos(dft_power*(float)(i+start_offset))*x_i; + const float4 img = -native_sin(dft_power*(float)(i+start_offset))*x_i; + + freq_val_real += real; + freq_val_img += img; + } + + freq_val_real.s0 = sub_group_reduce_add(freq_val_real.s0); + freq_val_real.s1 = sub_group_reduce_add(freq_val_real.s1); + freq_val_real.s2 = sub_group_reduce_add(freq_val_real.s2); + freq_val_real.s3 = sub_group_reduce_add(freq_val_real.s3); + + freq_val_img.s0 = sub_group_reduce_add(freq_val_img.s0); + freq_val_img.s1 = sub_group_reduce_add(freq_val_img.s1); + freq_val_img.s2 = sub_group_reduce_add(freq_val_img.s2); + freq_val_img.s3 = sub_group_reduce_add(freq_val_img.s3); + + if((freq_id < FREQS) && (get_sub_group_local_id() < 2*min((size_t)FREQS_PER_THREAD, (FREQS - freq_id)))) { #if TRANSPOSE_FRAMES - const int output_real_idx = OUTPUT_GET_INDEX(batch, freq_id, frame_id, 0); - const int output_imag_idx = OUTPUT_GET_INDEX(batch, freq_id, frame_id, 1); + const int output_idx = OUTPUT_GET_INDEX(batch, freq_id + get_sub_group_local_id()/2, frame_id, get_sub_group_local_id() % 2); #else - const int output_real_idx = OUTPUT_GET_INDEX(batch, frame_id, freq_id, 0); - const int output_imag_idx = OUTPUT_GET_INDEX(batch, frame_id, freq_id, 1); + const int output_idx = OUTPUT_GET_INDEX(batch, frame_id, freq_id + get_sub_group_local_id()/2, get_sub_group_local_id() % 2); #endif - - output[output_real_idx] = (OUTPUT_TYPE)real(freq_val); - output[output_imag_idx] = (OUTPUT_TYPE)imag(freq_val); + if ( (get_sub_group_local_id() % 2) == 0) + output[output_idx] = (OUTPUT_TYPE)freq_val_real[get_sub_group_local_id()/2]; + else + output[output_idx] = (OUTPUT_TYPE)freq_val_img[get_sub_group_local_id()/2]; + } + } } \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h index e5c94526a8aedb..8a8de1c99cc8fc 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h @@ -24,7 +24,7 @@ class STFTKernelBase : public KernelBaseOpenCL { using KernelBaseOpenCL::KernelBaseOpenCL; protected: - JitConstants GetJitConstants(const STFT_params& params) const; + virtual JitConstants GetJitConstants(const STFT_params& params) const; virtual CommonDispatchData CalcLaunchConfig(const STFT_params& params) const; KernelsData GetCommonKernelsData(const Params& params) const; void GetUpdateDispatchDataFunc(KernelData& kd) const override; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp index b46a51f366f33a..72469b0d315635 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp @@ -4,6 +4,9 @@ #include "stft_kernel_opt.h" +const size_t FREQ_PER_BLOCK = 256; +const size_t X_I_MAX_BUFFER_SIZE = 2048; +const size_t THREADS_PER_BLOCK = 256; namespace kernel_selector { ParamsKey STFTKernelOpt::GetSupportedKey() const { ParamsKey k; @@ -26,6 +29,15 @@ ParamsKey STFTKernelOpt::GetSupportedKey() const { return k; } +JitConstants STFTKernelOpt::GetJitConstants(const STFT_params& params) const { + JitConstants jit = STFTKernelBase::GetJitConstants(params); + + jit.AddConstants({MakeJitConstant("FREQ_PER_BLOCK", FREQ_PER_BLOCK)}); + jit.AddConstants({MakeJitConstant("X_I_MAX_BUFFER_SIZE", X_I_MAX_BUFFER_SIZE)}); + + return jit; +} + KernelsData STFTKernelOpt::GetKernelsData(const Params& params) const { return GetCommonKernelsData(params); } @@ -41,20 +53,17 @@ CommonDispatchData STFTKernelOpt::CalcLaunchConfig(const STFT_params& params) co OPENVINO_ASSERT(output.Dimentions() == 4); OPENVINO_ASSERT(output.X().v == 2); - std::vector> dimsByGws; - - if (params.transpose_frames) { - dispatchData.gws = {output.Feature().v, output.Y().v, output.Batch().v}; - } else { - dispatchData.gws = {output.Y().v, output.Feature().v, output.Batch().v}; - } + const size_t freqSize = params.transpose_frames ? output.Feature().v : output.Y().v; + const int blocksPerFreq = (freqSize + FREQ_PER_BLOCK-1)/FREQ_PER_BLOCK; - const int wantedThreadsPerBlock = 128; - const size_t threads = dispatchData.gws[0] < wantedThreadsPerBlock ? dispatchData.gws[0] : wantedThreadsPerBlock; + const size_t framesSize = params.transpose_frames ? output.Y().v : output.Feature().v; + const size_t batchSize = output.Batch().v; - dispatchData.lws = {threads, 1, 1}; + dispatchData.lws = {1, THREADS_PER_BLOCK}; + dispatchData.gws = {batchSize, framesSize * THREADS_PER_BLOCK * blocksPerFreq}; - //std::cout << dispatchData << std::endl; + std::cout << dispatchData << std::endl; + std::cout << "Blocks: " << dispatchData.gws[1]/dispatchData.lws[1] << std::endl; return dispatchData; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h index e338fd685e6cf4..0758c50f23157b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h @@ -11,6 +11,7 @@ class STFTKernelOpt : public STFTKernelBase { public: STFTKernelOpt() : STFTKernelBase("stft_opt") {} + JitConstants GetJitConstants(const STFT_params& params) const override; KernelsData GetKernelsData(const Params& params) const override; KernelsPriority GetKernelsPriority(const Params& params) const override; CommonDispatchData CalcLaunchConfig(const STFT_params& params) const override; From b5a9abe19087b4e48221d56d1ede43fa8a0d1733 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Tue, 21 Jan 2025 10:05:25 +0000 Subject: [PATCH 08/14] [gpu]: stft: stft_opt kernel is enabled only in cases which are supported. --- .../kernel_selector/cl_kernels/stft_opt.cl | 2 +- .../kernels/stft/stft_kernel_base.cpp | 9 ++++++- .../kernels/stft/stft_kernel_base.h | 1 + .../kernels/stft/stft_kernel_opt.cpp | 24 +++++++++++++++---- .../kernels/stft/stft_kernel_opt.h | 1 + 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl index 87fe5696f6d646..4450c32c854a11 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl @@ -27,7 +27,7 @@ KERNEL(stft_ref)( const size_t frame_step = (size_t)frame_step_buff[0]; const size_t window_size = INPUT1_SIZE_X; - __local float x_i_shared[X_I_MAX_BUFFER_SIZE]; + __local float x_i_shared[SHARED_X_I_BUFFER_SIZE]; const size_t block_size = get_local_size(0)*get_local_size(1)*get_local_size(2); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp index dd5cc745bff473..08bf9a822b3697 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.cpp @@ -57,7 +57,9 @@ CommonDispatchData STFTKernelBase::CalcLaunchConfig(const STFT_params& params) c } KernelsData STFTKernelBase::GetCommonKernelsData(const Params& params) const { - assert(params.GetType() == KernelType::STFT); + if (!Validate(params)) { + return {}; + } const auto& prim_params = static_cast(params); @@ -87,4 +89,9 @@ KernelsData STFTKernelBase::GetCommonKernelsData(const Params& params) const { return {k_data}; } + +bool STFTKernelBase::Validate(const Params& p) const { + return p.GetType() == KernelType::STFT; +} + } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h index 8a8de1c99cc8fc..adb402308bd7b7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_base.h @@ -28,5 +28,6 @@ class STFTKernelBase : public KernelBaseOpenCL { virtual CommonDispatchData CalcLaunchConfig(const STFT_params& params) const; KernelsData GetCommonKernelsData(const Params& params) const; void GetUpdateDispatchDataFunc(KernelData& kd) const override; + bool Validate(const Params& p) const override; }; } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp index 72469b0d315635..e9e322a0411f39 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp @@ -5,7 +5,7 @@ #include "stft_kernel_opt.h" const size_t FREQ_PER_BLOCK = 256; -const size_t X_I_MAX_BUFFER_SIZE = 2048; +const size_t STATIC_MAX_X_I_BUFFER = 1024; const size_t THREADS_PER_BLOCK = 256; namespace kernel_selector { ParamsKey STFTKernelOpt::GetSupportedKey() const { @@ -33,7 +33,10 @@ JitConstants STFTKernelOpt::GetJitConstants(const STFT_params& params) const { JitConstants jit = STFTKernelBase::GetJitConstants(params); jit.AddConstants({MakeJitConstant("FREQ_PER_BLOCK", FREQ_PER_BLOCK)}); - jit.AddConstants({MakeJitConstant("X_I_MAX_BUFFER_SIZE", X_I_MAX_BUFFER_SIZE)}); + jit.AddConstants({MakeJitConstant("STATIC_MAX_X_I_BUFFER", STATIC_MAX_X_I_BUFFER)}); + + const auto xiMaxBuffer = params.is_shape_agnostic ? "STATIC_MAX_X_I_BUFFER" : "INPUT1_SIZE_X"; + jit.AddConstants({MakeJitConstant("SHARED_X_I_BUFFER_SIZE", xiMaxBuffer)}); return jit; } @@ -54,7 +57,7 @@ CommonDispatchData STFTKernelOpt::CalcLaunchConfig(const STFT_params& params) co OPENVINO_ASSERT(output.X().v == 2); const size_t freqSize = params.transpose_frames ? output.Feature().v : output.Y().v; - const int blocksPerFreq = (freqSize + FREQ_PER_BLOCK-1)/FREQ_PER_BLOCK; + const size_t blocksPerFreq = (freqSize + FREQ_PER_BLOCK - 1) / FREQ_PER_BLOCK; const size_t framesSize = params.transpose_frames ? output.Y().v : output.Feature().v; const size_t batchSize = output.Batch().v; @@ -63,8 +66,21 @@ CommonDispatchData STFTKernelOpt::CalcLaunchConfig(const STFT_params& params) co dispatchData.gws = {batchSize, framesSize * THREADS_PER_BLOCK * blocksPerFreq}; std::cout << dispatchData << std::endl; - std::cout << "Blocks: " << dispatchData.gws[1]/dispatchData.lws[1] << std::endl; + std::cout << "Blocks: " << dispatchData.gws[1] / dispatchData.lws[1] << std::endl; return dispatchData; } +bool STFTKernelOpt::Validate(const Params& p) const { + if (STFTKernelBase::Validate(p) == false) + return false; + + const auto& params = static_cast(p); + const auto windowSize = params.inputs[1].LogicalSize(); + + if (params.is_shape_agnostic && windowSize > STATIC_MAX_X_I_BUFFER) + return false; + + return true; +} + } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h index 0758c50f23157b..455d780c356a4a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h @@ -16,5 +16,6 @@ class STFTKernelOpt : public STFTKernelBase { KernelsPriority GetKernelsPriority(const Params& params) const override; CommonDispatchData CalcLaunchConfig(const STFT_params& params) const override; ParamsKey GetSupportedKey() const override; + bool Validate(const Params& p) const override; }; } // namespace kernel_selector From facb700c837418d35c2b47168e801d5c13095175 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Tue, 21 Jan 2025 12:10:43 +0000 Subject: [PATCH 09/14] [gpu]: stft: Removed debug staff and disabled benchmark. --- .../src/kernel_selector/kernels/stft/stft_kernel_opt.cpp | 4 +--- .../intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp index e9e322a0411f39..330aba7c6c1951 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp @@ -5,7 +5,7 @@ #include "stft_kernel_opt.h" const size_t FREQ_PER_BLOCK = 256; -const size_t STATIC_MAX_X_I_BUFFER = 1024; +const size_t STATIC_MAX_X_I_BUFFER = 2048; const size_t THREADS_PER_BLOCK = 256; namespace kernel_selector { ParamsKey STFTKernelOpt::GetSupportedKey() const { @@ -65,8 +65,6 @@ CommonDispatchData STFTKernelOpt::CalcLaunchConfig(const STFT_params& params) co dispatchData.lws = {1, THREADS_PER_BLOCK}; dispatchData.gws = {batchSize, framesSize * THREADS_PER_BLOCK * blocksPerFreq}; - std::cout << dispatchData << std::endl; - std::cout << "Blocks: " << dispatchData.gws[1] / dispatchData.lws[1] << std::endl; return dispatchData; } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp index 6ef1a97b2320b3..bea020643058f6 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp @@ -159,7 +159,7 @@ class stft_benchmark : public ::testing::Test { }; } // namespace -TEST_F(stft_benchmark, benchmarks) { +TEST_F(stft_benchmark, DISABLED_benchmarks) { RunBenchmark({10000}, 1000, 2, true); RunBenchmark({10000}, 1000, 2, false); From 9076445a1c695fd2177275abc4f0c72106e5c43e Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Tue, 21 Jan 2025 12:45:52 +0000 Subject: [PATCH 10/14] [gpu]: stft: Vectorized initaial load to shared mem. --- .../kernel_selector/cl_kernels/stft_opt.cl | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl index 4450c32c854a11..0a068a532b5473 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl @@ -37,7 +37,21 @@ KERNEL(stft_ref)( const INPUT0_TYPE* restrict signal_for_this_frame = signal + batch*INPUT0_SIZE_X + frame_id*frame_step + start_offset; // Preload into shared mem: - for(size_t i = get_local_linear_id(); i < window_size; i+= block_size) { + for(size_t i = get_local_linear_id()*4; i < window_size; i+= block_size*4) { + // NOTE: Vectorization by internal unrolling loop, in order to compiler to + // decide it if can use vectorized vectorized instructions, + // which may depend on data type, pointer alignment etc). + #pragma unroll + for(size_t j = 0; j < 4; ++j) { + const float signal_val = (float)signal_for_this_frame[i+j]; + const float window_val = (float)window[i+j]; + x_i_shared[i+j] = signal_val*window_val; + } + } + + // Handle leftovers: + const size_t leftovers_start = window_size%(block_size*4); + for(size_t i = leftovers_start + get_local_linear_id(); i < window_size; i+= block_size*4) { const float signal_val = (float)signal_for_this_frame[i]; const float window_val = (float)window[i]; x_i_shared[i] = signal_val*window_val; @@ -47,22 +61,22 @@ KERNEL(stft_ref)( const size_t max_freq_for_this_block = min(freq_start + FREQ_PER_BLOCK, FREQS); - // Currently each sub group calcs 4 freq_id at the same time + // Currently each sub group calcs 4 freq_id at the same time. for(size_t freq_id = get_sub_group_id()*FREQS_PER_THREAD + freq_start; freq_id < max_freq_for_this_block; freq_id += get_num_sub_groups()*FREQS_PER_THREAD) { float4 freq_val_real = 0.0f; float4 freq_val_img = 0.0f; - // // dft_power = 2*PI*(k/N) from dft def. + // dft_power = 2*PI*(k/N) from dft def. float4 dft_power = 2.0f * M_PI_F / (float)frame_size; dft_power.s0 *= (float)(freq_id + 0); dft_power.s1 *= (float)(freq_id + 1); dft_power.s2 *= (float)(freq_id + 2); dft_power.s3 *= (float)(freq_id + 3); - // sin cos bound(?): Probably there is some external unit to calc sin cos - // which is overloaded with commands(each thread issues 8 such instructions) - // TODO: Implement fft. + // For bigger window_size kernel is sin cos bound: Probably there is some external + // unit to calc sin cos, which is overloaded with commands(each thread issues 8 such instructions). + // TODO: Implement fft for those cases. for(int i = get_sub_group_local_id(); i < window_size; i+= get_sub_group_size()) { const float x_i = x_i_shared[i]; From 2aff9c91deabf1225e8755e8205ccc95fd8e8618 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Fri, 24 Jan 2025 13:40:44 +0100 Subject: [PATCH 11/14] Update src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl Co-authored-by: Pawel Raasz --- .../intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl index 0a068a532b5473..fe85f9cc2c039e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2018-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // From 3be285f421110b25f4a469d44562a55159d5936a Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Fri, 24 Jan 2025 14:19:55 +0100 Subject: [PATCH 12/14] Update src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp Co-authored-by: Pawel Raasz --- .../intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp index bea020643058f6..7313446ba78eea 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/stft_gpu_benchamrk.cpp @@ -1,5 +1,4 @@ -// Copyright (C) 2018-2024 Intel Corporation -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2018-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // From 55797b6ec9845c998e6ba702755bb3bf961a84b5 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Fri, 24 Jan 2025 14:20:01 +0100 Subject: [PATCH 13/14] Update src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp Co-authored-by: Pawel Raasz --- .../src/kernel_selector/kernels/stft/stft_kernel_opt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp index 330aba7c6c1951..808be5491dc311 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2018-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // From 864027d93e95010108a281adf84149ab4e6a1db7 Mon Sep 17 00:00:00 2001 From: Piotr Kowalczyk Date: Fri, 24 Jan 2025 14:20:09 +0100 Subject: [PATCH 14/14] Update src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h Co-authored-by: Pawel Raasz --- .../src/kernel_selector/kernels/stft/stft_kernel_opt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h index 455d780c356a4a..026ef51b075728 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/stft/stft_kernel_opt.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2018-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 //