norm_kernel.cu
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <algorithm>
#include "ops.h"
#include "util.cuh"
template <typename scalar_t>
__global__ void layernorm_inplace_kernel(
    scalar_t* input,
    scalar_t* weight,
    scalar_t* bias,
    float eps,
    int hidden_size
) {
    int thread_id = threadIdx.x;
    int block_id = blockIdx.x;  // one block per token (row)
    float mean_sum = 0.0f;
    float sq_sum = 0.0f;
    // Accumulate the sum and the sum of squares in float for numerical
    // stability, even when scalar_t is half.
    for (int i = thread_id; i < hidden_size; i += blockDim.x) {
        float x = (float)input[hidden_size * block_id + i];
        mean_sum += x;
        sq_sum += x * x;
    }
    mean_sum = blockReduceSum(mean_sum) / hidden_size;
    // blockReduceSum typically reuses a shared-memory buffer internally, so
    // synchronize before the second reduction to avoid a data race.
    __syncthreads();
    sq_sum = blockReduceSum(sq_sum) / hidden_size;
    // The reduced sums are assumed valid only on thread 0, so broadcast the
    // mean and the inverse standard deviation through shared memory; every
    // thread must normalize with the same values.
    __shared__ float s_mean;
    __shared__ float s_inv_std;
    if (thread_id == 0) {
        s_mean = mean_sum;
        s_inv_std = rsqrtf(eps + sq_sum - mean_sum * mean_sum);
    }
    __syncthreads();
    // Normalize in place: y = (x - mean) / sqrt(var + eps) * weight + bias.
    for (int i = thread_id; i < hidden_size; i += blockDim.x) {
        int idx = hidden_size * block_id + i;
        float x = (float)input[idx];
        input[idx] = (scalar_t)((x - s_mean) * s_inv_std * (float)weight[i]
                                + (float)bias[i]);
    }
}
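util.cuh is not part of this listing, so the exact blockReduceSum used above is an assumption. The kernel relies on the usual warp-shuffle block reduction, in which the full sum is only guaranteed on thread 0 and a static shared-memory buffer is reused across calls; the sketch below shows that common pattern, not necessarily the repo's actual implementation:

// Sketch of the assumed blockReduceSum from util.cuh (illustrative only).
// Assumes blockDim.x is a multiple of 32 so every shuffle lane is active.
__inline__ __device__ float warpReduceSum(float val) {
    // Tree-reduce within a warp using register shuffles; the full warp sum
    // ends up in lane 0.
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

__inline__ __device__ float blockReduceSum(float val) {
    static __shared__ float shared[32];  // one partial sum per warp
    int lane = threadIdx.x & 31;
    int wid = threadIdx.x >> 5;
    val = warpReduceSum(val);
    if (lane == 0) shared[wid] = val;    // warp leaders publish partials
    __syncthreads();
    // Warp 0 reduces the per-warp partials; the result is valid on thread 0.
    val = (threadIdx.x < (blockDim.x + 31) / 32) ? shared[lane] : 0.0f;
    if (wid == 0) val = warpReduceSum(val);
    return val;
}

The reuse of the static shared buffer across calls is why the kernel above synchronizes between the two back-to-back reductions.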
void layernorm_inplace(
    torch::Tensor& input,
    torch::Tensor& weight,
    torch::Tensor& bias,
    float eps) {
    int hidden_size = input.size(-1);
    int num_tokens = input.numel() / hidden_size;
    // Launch one block per token; the strided loops inside the kernel cover
    // hidden sizes larger than the 1024-thread block cap.
    dim3 grid(num_tokens);
    dim3 block(std::min(hidden_size, 1024));
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        input.scalar_type(), "layernorm_inplace_kernel",
        ([&] {
            layernorm_inplace_kernel<scalar_t><<<grid, block, 0, stream>>>(
                input.data_ptr<scalar_t>(),
                weight.data_ptr<scalar_t>(),
                bias.data_ptr<scalar_t>(),
                eps,
                hidden_size);
        })
    );
}
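ops.h is also not shown; presumably it declares layernorm_inplace so the extension can export it. As a minimal sketch, assuming a standard PyTorch C++ extension build (the binding file and module boilerplate below are hypothetical, not from the repo):

// Hypothetical bindings.cpp for this extension; ops.h is assumed to declare
// layernorm_inplace. TORCH_EXTENSION_NAME is supplied by the extension build.
#include <torch/extension.h>
#include "ops.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("layernorm_inplace", &layernorm_inplace,
          "In-place LayerNorm over the last dimension (CUDA)");
}

Since the kernel normalizes in place, its output can be checked from Python against torch.nn.functional.layer_norm by running the reference on a clone of the input.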