Skip to content

Commit a0a037a

Browse files
committed
[GPU] Using accumulator dtype for computations in rms_gpu_bfyx_opt
1 parent fc31f55 commit a0a037a

File tree

1 file changed

+22
-19
lines changed

1 file changed

+22
-19
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rms_gpu_bfyx_opt.cl

Lines changed: 22 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -12,12 +12,14 @@
1212
#if SUBGROUP_BLOCK_SIZE == 1
1313
#define BLOCK_READ(ptr, offset) DT_INPUT_BLOCK_READ(ptr, offset)
1414
#define BLOCK_WRITE(ptr, offset, val) DT_OUTPUT_BLOCK_WRITE(ptr, offset, val)
15-
#define INPUT_VEC_TYPE INPUT0_TYPE
15+
#define ACC_TYPE ACCUMULATOR_TYPE
16+
#define TO_ACC_TYPE(x) TO_ACCUMULATOR_TYPE(x)
1617
#define OUTPUT_VEC_TYPE OUTPUT_TYPE
1718
#else
1819
#define BLOCK_READ(ptr, offset) CAT(DT_INPUT_BLOCK_READ, SUBGROUP_BLOCK_SIZE)(ptr, offset)
1920
#define BLOCK_WRITE(ptr, offset, val) CAT(DT_OUTPUT_BLOCK_WRITE, SUBGROUP_BLOCK_SIZE)(ptr, offset, val)
20-
#define INPUT_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, SUBGROUP_BLOCK_SIZE)
21+
#define ACC_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, SUBGROUP_BLOCK_SIZE)
22+
#define TO_ACC_TYPE(x) CAT(convert_, ACC_TYPE)(x)
2123
#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, SUBGROUP_BLOCK_SIZE)
2224
#endif
2325

@@ -38,7 +40,7 @@ KERNEL(rms_gpu_bfyx_opt)(
3840
const uint data_offset = data_idx * data_size;
3941
const uint subgroup_offset = get_sub_group_id() * get_sub_group_size() * items_num;
4042

41-
INPUT0_TYPE data[STACK_SIZE];
43+
ACCUMULATOR_TYPE data[STACK_SIZE];
4244
ACCUMULATOR_TYPE rms = ACCUMULATOR_VAL_ZERO;
4345

4446
__local ACCUMULATOR_TYPE slm_buf[SLM_SIZE];
@@ -48,15 +50,15 @@ KERNEL(rms_gpu_bfyx_opt)(
4850
{
4951
for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i += SUBGROUP_BLOCK_SIZE)
5052
{
51-
INPUT_VEC_TYPE vec_tmp = BLOCK_READ(input, data_offset + subgroup_offset + i * get_sub_group_size());
53+
ACC_TYPE vec_tmp = TO_ACC_TYPE(BLOCK_READ(input, data_offset + subgroup_offset + i * get_sub_group_size()));
5254
#if SUBGROUP_BLOCK_SIZE == 1
53-
rms += TO_ACCUMULATOR_TYPE(native_powr(vec_tmp, 2));
55+
rms += native_powr(vec_tmp, 2);
5456
data[i] = vec_tmp;
5557
#else
5658
unroll_for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
5759
{
58-
INPUT0_TYPE tmp = vec_tmp[j];
59-
rms += TO_ACCUMULATOR_TYPE(native_powr(tmp, 2));
60+
ACCUMULATOR_TYPE tmp = vec_tmp[j];
61+
rms += native_powr(tmp, 2);
6062
data[i + j] = tmp;
6163
}
6264
#endif
@@ -65,15 +67,15 @@ KERNEL(rms_gpu_bfyx_opt)(
6567

6668
for (; i < items_num; i++)
6769
{
68-
INPUT0_TYPE tmp = input[data_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()];
69-
rms += TO_ACCUMULATOR_TYPE(native_powr(tmp, 2));
70+
ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[data_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()]);
71+
rms += native_powr(tmp, 2);
7072
data[i] = tmp;
7173
}
7274

7375
if (in_data_idx < leftovers)
7476
{
75-
INPUT0_TYPE tmp = input[data_offset + workers_per_data * items_num + in_data_idx];
76-
rms += TO_ACCUMULATOR_TYPE(native_powr(tmp, 2));
77+
ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[data_offset + workers_per_data * items_num + in_data_idx]);
78+
rms += native_powr(tmp, 2);
7779
data[items_num] = tmp;
7880
}
7981

@@ -103,32 +105,33 @@ KERNEL(rms_gpu_bfyx_opt)(
103105
{
104106
for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE); i += SUBGROUP_BLOCK_SIZE)
105107
{
106-
INPUT_VEC_TYPE vec_gamma = BLOCK_READ(gamma, subgroup_offset + i * get_sub_group_size());
108+
ACC_TYPE vec_gamma = TO_ACC_TYPE(BLOCK_READ(gamma, subgroup_offset + i * get_sub_group_size()));
107109
OUTPUT_VEC_TYPE vec_tmp;
108110
#if SUBGROUP_BLOCK_SIZE == 1
109-
vec_tmp = TO_OUTPUT_TYPE(rms * TO_ACCUMULATOR_TYPE(data[i]) * TO_ACCUMULATOR_TYPE(vec_gamma));
111+
vec_tmp = TO_OUTPUT_TYPE(rms * data[i] * vec_gamma);
110112
#else
111113
unroll_for (int j = 0; j < SUBGROUP_BLOCK_SIZE; j++)
112-
vec_tmp[j] = TO_OUTPUT_TYPE(rms * TO_ACCUMULATOR_TYPE(data[i + j]) * TO_ACCUMULATOR_TYPE(vec_gamma[j]));
114+
vec_tmp[j] = TO_OUTPUT_TYPE(rms * data[i + j] * vec_gamma[j]);
113115
#endif
114116
BLOCK_WRITE(output, data_offset + subgroup_offset + i * get_sub_group_size(), vec_tmp);
115117
}
116118
}
117119

118120
for (; i < items_num; i++)
119121
{
120-
INPUT1_TYPE temp = gamma[subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()];
121-
output[data_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()] = TO_OUTPUT_TYPE(rms * TO_ACCUMULATOR_TYPE(data[i]) * TO_ACCUMULATOR_TYPE(temp));
122+
ACCUMULATOR_TYPE temp = TO_ACCUMULATOR_TYPE(gamma[subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()]);
123+
output[data_offset + subgroup_offset + get_sub_group_local_id() + i * get_sub_group_size()] = TO_OUTPUT_TYPE(rms * data[i] * temp);
122124
}
123125

124126
if (in_data_idx < leftovers)
125127
{
126-
INPUT1_TYPE temp = gamma[workers_per_data * items_num + in_data_idx];
127-
output[data_offset + workers_per_data * items_num + in_data_idx] = TO_OUTPUT_TYPE(rms * TO_ACCUMULATOR_TYPE(data[items_num]) * TO_ACCUMULATOR_TYPE(temp));
128+
ACCUMULATOR_TYPE temp = TO_ACCUMULATOR_TYPE(gamma[workers_per_data * items_num + in_data_idx]);
129+
output[data_offset + workers_per_data * items_num + in_data_idx] = TO_OUTPUT_TYPE(rms * data[items_num] * temp);
128130
}
129131
}
130132
#undef USE_BLOCK_WRITE
131133
#undef BLOCK_READ
132134
#undef BLOCK_WRITE
133-
#undef INPUT_VEC_TYPE
135+
#undef ACC_TYPE
136+
#undef TO_ACC_TYPE
134137
#undef OUTPUT_VEC_TYPE

0 commit comments

Comments
 (0)