12
12
// Helper macros for the kernel below, selected at preprocessing time by SUBGROUP_BLOCK_SIZE.
// This text is a diff view: lines prefixed "- " are the removed INPUT_VEC_TYPE definitions,
// lines prefixed "+ " are the added ACC_TYPE / TO_ACC_TYPE definitions, and the bare numeric
// lines appear to be old/new line numbers from the diff rendering.
// NOTE(review): the space between each function-like macro name and its '(' looks like an
// extraction artifact; taken literally it would define object-like macros -- confirm against
// the original file.
#if SUBGROUP_BLOCK_SIZE == 1
13
13
// Block size 1: BLOCK_READ / BLOCK_WRITE forward to the un-suffixed DT_* helpers.
#define BLOCK_READ (ptr , offset ) DT_INPUT_BLOCK_READ(ptr, offset)
14
14
#define BLOCK_WRITE (ptr , offset , val ) DT_OUTPUT_BLOCK_WRITE(ptr, offset, val)
15
- #define INPUT_VEC_TYPE INPUT0_TYPE
15
// Added side: ACC_TYPE aliases ACCUMULATOR_TYPE and TO_ACC_TYPE forwards to TO_ACCUMULATOR_TYPE.
+ #define ACC_TYPE ACCUMULATOR_TYPE
16
+ #define TO_ACC_TYPE (x ) TO_ACCUMULATOR_TYPE(x)
16
17
#define OUTPUT_VEC_TYPE OUTPUT_TYPE
17
18
#else
18
19
// Other block sizes: the helper name is formed by pasting SUBGROUP_BLOCK_SIZE onto the DT_* name
// through CAT (CAT and MAKE_VECTOR_TYPE are defined outside this view).
#define BLOCK_READ (ptr , offset ) CAT(DT_INPUT_BLOCK_READ, SUBGROUP_BLOCK_SIZE)(ptr, offset)
19
20
#define BLOCK_WRITE (ptr , offset , val ) CAT(DT_OUTPUT_BLOCK_WRITE, SUBGROUP_BLOCK_SIZE)(ptr, offset, val)
20
- #define INPUT_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, SUBGROUP_BLOCK_SIZE)
21
// Added side: ACC_TYPE is built from ACCUMULATOR_TYPE via MAKE_VECTOR_TYPE, and TO_ACC_TYPE
// pastes "convert_" onto ACC_TYPE.
// NOTE(review): this presumably relies on CAT expanding its arguments before pasting so the
// result names a convert_<vector type> built-in -- verify against CAT's definition.
+ #define ACC_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, SUBGROUP_BLOCK_SIZE)
22
+ #define TO_ACC_TYPE (x ) CAT(convert_, ACC_TYPE)(x)
21
23
#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, SUBGROUP_BLOCK_SIZE)
22
24
#endif
23
25
@@ -38,7 +40,7 @@ KERNEL(rms_gpu_bfyx_opt)(
38
40
const uint data_offset = data_idx * data_size ;
39
41
const uint subgroup_offset = get_sub_group_id () * get_sub_group_size () * items_num ;
40
42
41
- INPUT0_TYPE data [STACK_SIZE ];
43
+ ACCUMULATOR_TYPE data [STACK_SIZE ];
42
44
ACCUMULATOR_TYPE rms = ACCUMULATOR_VAL_ZERO ;
43
45
44
46
__local ACCUMULATOR_TYPE slm_buf [SLM_SIZE ];
@@ -48,15 +50,15 @@ KERNEL(rms_gpu_bfyx_opt)(
48
50
{
49
51
for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE ); i += SUBGROUP_BLOCK_SIZE )
50
52
{
51
- INPUT_VEC_TYPE vec_tmp = BLOCK_READ (input , data_offset + subgroup_offset + i * get_sub_group_size ());
53
+ ACC_TYPE vec_tmp = TO_ACC_TYPE ( BLOCK_READ (input , data_offset + subgroup_offset + i * get_sub_group_size () ));
52
54
#if SUBGROUP_BLOCK_SIZE == 1
53
- rms += TO_ACCUMULATOR_TYPE ( native_powr (vec_tmp , 2 ) );
55
+ rms += native_powr (vec_tmp , 2 );
54
56
data [i ] = vec_tmp ;
55
57
#else
56
58
unroll_for (int j = 0 ; j < SUBGROUP_BLOCK_SIZE ; j ++ )
57
59
{
58
- INPUT0_TYPE tmp = vec_tmp [j ];
59
- rms += TO_ACCUMULATOR_TYPE ( native_powr (tmp , 2 ) );
60
+ ACCUMULATOR_TYPE tmp = vec_tmp [j ];
61
+ rms += native_powr (tmp , 2 );
60
62
data [i + j ] = tmp ;
61
63
}
62
64
#endif
@@ -65,15 +67,15 @@ KERNEL(rms_gpu_bfyx_opt)(
65
67
66
68
for (; i < items_num ; i ++ )
67
69
{
68
- INPUT0_TYPE tmp = input [data_offset + subgroup_offset + get_sub_group_local_id () + i * get_sub_group_size ()];
69
- rms += TO_ACCUMULATOR_TYPE ( native_powr (tmp , 2 ) );
70
+ ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE ( input [data_offset + subgroup_offset + get_sub_group_local_id () + i * get_sub_group_size ()]) ;
71
+ rms += native_powr (tmp , 2 );
70
72
data [i ] = tmp ;
71
73
}
72
74
73
75
if (in_data_idx < leftovers )
74
76
{
75
- INPUT0_TYPE tmp = input [data_offset + workers_per_data * items_num + in_data_idx ];
76
- rms += TO_ACCUMULATOR_TYPE ( native_powr (tmp , 2 ) );
77
+ ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE ( input [data_offset + workers_per_data * items_num + in_data_idx ]) ;
78
+ rms += native_powr (tmp , 2 );
77
79
data [items_num ] = tmp ;
78
80
}
79
81
@@ -103,32 +105,33 @@ KERNEL(rms_gpu_bfyx_opt)(
103
105
{
104
106
for (; i < items_num - (items_num % SUBGROUP_BLOCK_SIZE ); i += SUBGROUP_BLOCK_SIZE )
105
107
{
106
- INPUT_VEC_TYPE vec_gamma = BLOCK_READ (gamma , subgroup_offset + i * get_sub_group_size ());
108
+ ACC_TYPE vec_gamma = TO_ACC_TYPE ( BLOCK_READ (gamma , subgroup_offset + i * get_sub_group_size () ));
107
109
OUTPUT_VEC_TYPE vec_tmp ;
108
110
#if SUBGROUP_BLOCK_SIZE == 1
109
- vec_tmp = TO_OUTPUT_TYPE (rms * TO_ACCUMULATOR_TYPE ( data [i ]) * TO_ACCUMULATOR_TYPE ( vec_gamma ) );
111
+ vec_tmp = TO_OUTPUT_TYPE (rms * data [i ] * vec_gamma );
110
112
#else
111
113
unroll_for (int j = 0 ; j < SUBGROUP_BLOCK_SIZE ; j ++ )
112
- vec_tmp [j ] = TO_OUTPUT_TYPE (rms * TO_ACCUMULATOR_TYPE ( data [i + j ]) * TO_ACCUMULATOR_TYPE ( vec_gamma [j ]) );
114
+ vec_tmp [j ] = TO_OUTPUT_TYPE (rms * data [i + j ] * vec_gamma [j ]);
113
115
#endif
114
116
BLOCK_WRITE (output , data_offset + subgroup_offset + i * get_sub_group_size (), vec_tmp );
115
117
}
116
118
}
117
119
118
120
for (; i < items_num ; i ++ )
119
121
{
120
- INPUT1_TYPE temp = gamma [subgroup_offset + get_sub_group_local_id () + i * get_sub_group_size ()];
121
- output [data_offset + subgroup_offset + get_sub_group_local_id () + i * get_sub_group_size ()] = TO_OUTPUT_TYPE (rms * TO_ACCUMULATOR_TYPE ( data [i ]) * TO_ACCUMULATOR_TYPE ( temp ) );
122
+ ACCUMULATOR_TYPE temp = TO_ACCUMULATOR_TYPE ( gamma [subgroup_offset + get_sub_group_local_id () + i * get_sub_group_size ()]) ;
123
+ output [data_offset + subgroup_offset + get_sub_group_local_id () + i * get_sub_group_size ()] = TO_OUTPUT_TYPE (rms * data [i ] * temp );
122
124
}
123
125
124
126
if (in_data_idx < leftovers )
125
127
{
126
- INPUT1_TYPE temp = gamma [workers_per_data * items_num + in_data_idx ];
127
- output [data_offset + workers_per_data * items_num + in_data_idx ] = TO_OUTPUT_TYPE (rms * TO_ACCUMULATOR_TYPE ( data [items_num ]) * TO_ACCUMULATOR_TYPE ( temp ) );
128
+ ACCUMULATOR_TYPE temp = TO_ACCUMULATOR_TYPE ( gamma [workers_per_data * items_num + in_data_idx ]) ;
129
+ output [data_offset + workers_per_data * items_num + in_data_idx ] = TO_OUTPUT_TYPE (rms * data [items_num ] * temp );
128
130
}
129
131
}
130
132
// Undefine the helper macros once the kernel body has ended. In this diff view the "- " line
// drops the #undef of INPUT_VEC_TYPE and the "+ " lines add #undefs for ACC_TYPE and TO_ACC_TYPE.
// NOTE(review): USE_BLOCK_WRITE is not defined anywhere in the visible portion of the file --
// confirm it is defined elsewhere, otherwise this #undef has no effect.
#undef USE_BLOCK_WRITE
131
133
#undef BLOCK_READ
132
134
#undef BLOCK_WRITE
133
- #undef INPUT_VEC_TYPE
135
+ #undef ACC_TYPE
136
+ #undef TO_ACC_TYPE
134
137
#undef OUTPUT_VEC_TYPE
0 commit comments