Skip to content

Commit 9076445

Browse files
committed
[gpu]: stft: Vectorized initial load to shared mem.
1 parent facb700 commit 9076445

File tree

1 file changed

+20
-6
lines changed
  • src/plugins/intel_gpu/src/kernel_selector/cl_kernels

1 file changed

+20
-6
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/stft_opt.cl

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,21 @@ KERNEL(stft_ref)(
3737
const INPUT0_TYPE* restrict signal_for_this_frame = signal + batch*INPUT0_SIZE_X + frame_id*frame_step + start_offset;
3838

3939
// Preload into shared mem:
40-
for(size_t i = get_local_linear_id(); i < window_size; i+= block_size) {
40+
for(size_t i = get_local_linear_id()*4; i < window_size; i+= block_size*4) {
41+
// NOTE: Vectorization by unrolling the inner loop, in order for the compiler to
42+
// decide if it can use vectorized instructions,
43+
// (which may depend on data type, pointer alignment, etc.).
44+
#pragma unroll
45+
for(size_t j = 0; j < 4; ++j) {
46+
const float signal_val = (float)signal_for_this_frame[i+j];
47+
const float window_val = (float)window[i+j];
48+
x_i_shared[i+j] = signal_val*window_val;
49+
}
50+
}
51+
52+
// Handle leftovers:
53+
const size_t leftovers_start = window_size%(block_size*4);
54+
for(size_t i = leftovers_start + get_local_linear_id(); i < window_size; i+= block_size*4) {
4155
const float signal_val = (float)signal_for_this_frame[i];
4256
const float window_val = (float)window[i];
4357
x_i_shared[i] = signal_val*window_val;
@@ -47,22 +61,22 @@ KERNEL(stft_ref)(
4761

4862
const size_t max_freq_for_this_block = min(freq_start + FREQ_PER_BLOCK, FREQS);
4963

50-
// Currently each sub group calcs 4 freq_id at the same time
64+
// Currently each sub group calcs 4 freq_id at the same time.
5165
for(size_t freq_id = get_sub_group_id()*FREQS_PER_THREAD + freq_start; freq_id < max_freq_for_this_block; freq_id += get_num_sub_groups()*FREQS_PER_THREAD) {
5266

5367
float4 freq_val_real = 0.0f;
5468
float4 freq_val_img = 0.0f;
5569

56-
// // dft_power = 2*PI*(k/N) from dft def.
70+
// dft_power = 2*PI*(k/N) from dft def.
5771
float4 dft_power = 2.0f * M_PI_F / (float)frame_size;
5872
dft_power.s0 *= (float)(freq_id + 0);
5973
dft_power.s1 *= (float)(freq_id + 1);
6074
dft_power.s2 *= (float)(freq_id + 2);
6175
dft_power.s3 *= (float)(freq_id + 3);
6276

63-
// sin cos bound(?): Probably there is some external unit to calc sin cos
64-
// which is overloaded with commands(each thread issues 8 such instructions)
65-
// TODO: Implement fft.
77+
// For bigger window_size kernel is sin cos bound: Probably there is some external
78+
// unit to calc sin cos, which is overloaded with commands(each thread issues 8 such instructions).
79+
// TODO: Implement fft for those cases.
6680
for(int i = get_sub_group_local_id(); i < window_size; i+= get_sub_group_size()) {
6781
const float x_i = x_i_shared[i];
6882

0 commit comments

Comments
 (0)