Skip to content

Commit 4fdf29b

Browse files
committed
[feat] Changed the block size for better throughput
1 parent 05c178e commit 4fdf29b

File tree

1 file changed

+15
-17
lines changed

1 file changed

+15
-17
lines changed

src/layer.cu

Lines changed: 15 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
#include "float.h"
33
//#include <nvToolsExt.h>
44

5-
#define BLOCK_SIZE 32
5+
#define BLOCK_SIZE 128
66

77
/* ReLU
88
* @param [in & out] inout: [N]
@@ -183,12 +183,12 @@ void Conv1D_ReLU_Stream_CUDA(Tensor *in,
183183
cudaStreamCreate(&s2);
184184
cudaStreamCreate(&s3);
185185

186-
dim3 blockDim = 32;
186+
dim3 blockDim = BLOCK_SIZE;
187187

188-
Conv1D_ReLU_Batch_Kernel<<<(((B * c0_OC * c0_os) + 32 - 1) / 32), blockDim, 0, s0>>>(in->gbuf, conv0_w->gbuf, conv0_b->gbuf, conv0_a->gbuf, B, C, s, c0_OC, c0_K);
189-
Conv1D_ReLU_Batch_Kernel<<<(((B * c1_OC * c1_os) + 32 - 1) / 32), blockDim, 0, s1>>>(in->gbuf, conv1_w->gbuf, conv1_b->gbuf, conv1_a->gbuf, B, C, s, c1_OC, c1_K);
190-
Conv1D_ReLU_Batch_Kernel<<<(((B * c2_OC * c2_os) + 32 - 1) / 32), blockDim, 0, s2>>>(in->gbuf, conv2_w->gbuf, conv2_b->gbuf, conv2_a->gbuf, B, C, s, c2_OC, c2_K);
191-
Conv1D_ReLU_Batch_Kernel<<<(((B * c3_OC * c3_os) + 32 - 1) / 32), blockDim, 0, s3>>>(in->gbuf, conv3_w->gbuf, conv3_b->gbuf, conv3_a->gbuf, B, C, s, c3_OC, c3_K);
188+
Conv1D_ReLU_Batch_Kernel<<<(((B * c0_OC * c0_os) + BLOCK_SIZE - 1) / BLOCK_SIZE), blockDim, 0, s0>>>(in->gbuf, conv0_w->gbuf, conv0_b->gbuf, conv0_a->gbuf, B, C, s, c0_OC, c0_K);
189+
Conv1D_ReLU_Batch_Kernel<<<(((B * c1_OC * c1_os) + BLOCK_SIZE - 1) / BLOCK_SIZE), blockDim, 0, s1>>>(in->gbuf, conv1_w->gbuf, conv1_b->gbuf, conv1_a->gbuf, B, C, s, c1_OC, c1_K);
190+
Conv1D_ReLU_Batch_Kernel<<<(((B * c2_OC * c2_os) + BLOCK_SIZE - 1) / BLOCK_SIZE), blockDim, 0, s2>>>(in->gbuf, conv2_w->gbuf, conv2_b->gbuf, conv2_a->gbuf, B, C, s, c2_OC, c2_K);
191+
Conv1D_ReLU_Batch_Kernel<<<(((B * c3_OC * c3_os) + BLOCK_SIZE - 1) / BLOCK_SIZE), blockDim, 0, s3>>>(in->gbuf, conv3_w->gbuf, conv3_b->gbuf, conv3_a->gbuf, B, C, s, c3_OC, c3_K);
192192
CHECK_CUDA(cudaDeviceSynchronize());
193193
cudaStreamDestroy(s0);
194194
cudaStreamDestroy(s1);
@@ -275,12 +275,12 @@ void GetMax_Stream_CUDA(
275275
cudaStreamCreate(&s2);
276276
cudaStreamCreate(&s3);
277277

278-
dim3 blockDim = 32;
278+
dim3 blockDim = BLOCK_SIZE;
279279

280-
GetMax_Batch_Kernel<<<(B0 * c0_C + 32 - 1) / 32, blockDim, 0, s0>>>(conv0_a->gbuf, pool0_a->gbuf, B0, c0_C, c0_s);
281-
GetMax_Batch_Kernel<<<(B1 * c1_C + 32 - 1) / 32, blockDim, 0, s1>>>(conv1_a->gbuf, pool1_a->gbuf, B1, c1_C, c1_s);
282-
GetMax_Batch_Kernel<<<(B2 * c2_C + 32 - 1) / 32, blockDim, 0, s2>>>(conv2_a->gbuf, pool2_a->gbuf, B2, c2_C, c2_s);
283-
GetMax_Batch_Kernel<<<(B3 * c3_C + 32 - 1) / 32, blockDim, 0, s3>>>(conv3_a->gbuf, pool3_a->gbuf, B3, c3_C, c3_s);
280+
GetMax_Batch_Kernel<<<(B0 * c0_C + BLOCK_SIZE - 1) / BLOCK_SIZE, blockDim, 0, s0>>>(conv0_a->gbuf, pool0_a->gbuf, B0, c0_C, c0_s);
281+
GetMax_Batch_Kernel<<<(B1 * c1_C + BLOCK_SIZE - 1) / BLOCK_SIZE, blockDim, 0, s1>>>(conv1_a->gbuf, pool1_a->gbuf, B1, c1_C, c1_s);
282+
GetMax_Batch_Kernel<<<(B2 * c2_C + BLOCK_SIZE - 1) / BLOCK_SIZE, blockDim, 0, s2>>>(conv2_a->gbuf, pool2_a->gbuf, B2, c2_C, c2_s);
283+
GetMax_Batch_Kernel<<<(B3 * c3_C + BLOCK_SIZE - 1) / BLOCK_SIZE, blockDim, 0, s3>>>(conv3_a->gbuf, pool3_a->gbuf, B3, c3_C, c3_s);
284284
CHECK_CUDA(cudaDeviceSynchronize());
285285
cudaStreamDestroy(s0);
286286
cudaStreamDestroy(s1);
@@ -360,8 +360,8 @@ void Concat_CUDA(Tensor *in1, Tensor *in2, Tensor *in3, Tensor *in4, Tensor *out
360360
size_t N3 = in3->shape[1];
361361
size_t N4 = in4->shape[1];
362362

363-
dim3 blockDim = 32;
364-
dim3 gridDim = (B * (N1 + N2 + N3 + N4) + 32 - 1) / 32;
363+
dim3 blockDim = BLOCK_SIZE;
364+
dim3 gridDim = (B * (N1 + N2 + N3 + N4) + BLOCK_SIZE - 1) / BLOCK_SIZE;
365365
Concat_Batch_Kernel<<<gridDim, blockDim>>>(in1->gbuf, in2->gbuf, in3->gbuf, in4->gbuf, out->gbuf, B, N1, N2, N3, N4);
366366
CHECK_CUDA(cudaDeviceSynchronize());
367367
}
@@ -458,6 +458,7 @@ void Linear(Tensor *in, Tensor *w, Tensor *b, Tensor *out) {
458458
// CHECK_CUDA(cudaDeviceSynchronize());
459459
// }
460460

461+
//NOTE: 여기 아래부터는 blockDim = 32가 더 빠름
461462
//MARK: L_Kernel
462463
__global__ void Linear_Kernel(float *in, float *w, float *b, float *out, size_t N, size_t M) {
463464
int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -541,7 +542,7 @@ void Linear_Stream_CUDA(Tensor *in,
541542

542543
size_t B = in->shape[0];
543544
size_t N = in->shape[1];
544-
// size_t M는 activation이 다 다름
545+
//NOTE: size_t M는 activation이 다 다름
545546

546547
dim3 blockDim = 32;
547548
Linear_Batch_Kernel<<<(B * gate_a->shape[1] + 32 - 1) / 32, blockDim, 0, s0>>>(in->gbuf, gate_w->gbuf, gate_b->gbuf, gate_a->gbuf, B, N, gate_w->shape[0]);
@@ -607,9 +608,7 @@ __global__ void Softmax_Batch_Kernel(float *inout, size_t B, size_t N) {
607608
if (idx >= B * N) return;
608609

609610
size_t bi = idx / N; // 배치 인덱스
610-
//size_t li = idx % N; // 배치 내의 인덱스
611611

612-
//아래의 코드를 바꿔줘.
613612
float max_val = -INFINITY;
614613
for (size_t i = 0; i < N; i++) {
615614
max_val = fmaxf(max_val, inout[bi * N + i]);
@@ -688,7 +687,6 @@ void Scaling_Stream_CUDA(Tensor *expert0_a, Tensor *expert1_a, Tensor *expert2_a
688687
dim3 blockDim = 32;
689688
dim3 gridDim = (B * expert0_a->shape[1] + 32 - 1) / 32;
690689

691-
// 여기서 원래 gate_a->buf[0]를 하면돼었었는데 이제는 배치가 추가되어서 그렇게 못함. 어떻게 해야 하지?
692690
Scaling_Batch_Kernel<<<gridDim, blockDim, 0, s0>>>(expert0_a->gbuf, B, expert0_a->shape[1], 4, gate_a->buf+0);
693691
Scaling_Batch_Kernel<<<gridDim, blockDim, 0, s1>>>(expert1_a->gbuf, B, expert1_a->shape[1], 4, gate_a->buf+1);
694692
Scaling_Batch_Kernel<<<gridDim, blockDim, 0, s2>>>(expert2_a->gbuf, B, expert2_a->shape[1], 4, gate_a->buf+2);

0 commit comments

Comments (0)