Skip to content

Commit 7bfebb9

Browse files
authored
[Bugfix][Kernel] fixed some kernel block-count calculation errors (xlite-dev#23)
* Update elementwise.cu * Update relu.cu * Update elementwise.cu * Update rms_norm.cu * Update rms_norm.py * Update README.md * Update README.md * Update sigmoid.cu * Update relu.cu * Update softmax.cu * Update softmax.py * Update block_all_reduce.cu * Update dot_product.cu * Update elementwise.cu * Update histogram.cu
1 parent bba5c48 commit 7bfebb9

File tree

8 files changed

+26
-22
lines changed

8 files changed

+26
-22
lines changed

dot-product/dot_product.cu

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#define WARP_SIZE 32
1414
#define INT4(value) (reinterpret_cast<int4*>(&(value))[0])
1515
#define FLOAT4(value) (reinterpret_cast<float4*>(&(value))[0])
16+
#define HALF2(value) (reinterpret_cast<half2*>(&(value))[0])
17+
#define BFLOAT2(value) (reinterpret_cast<__nv_bfloat162*>(&(value))[0])
1618

1719
// -------------------------------------- FP32 --------------------------------------
1820
// Warp Reduce Sum
@@ -129,8 +131,8 @@ __global__ void dot_prod_f16x2_f32_kernel(half* a, half* b, float* y, int N) {
129131
__shared__ float reduce_smem[NUM_WARPS];
130132

131133
// keeping the data in registers is enough for the warp operation.
132-
half2 reg_a = (reinterpret_cast<half2*>(&(a[idx]))[0]);
133-
half2 reg_b = (reinterpret_cast<half2*>(&(b[idx]))[0]);
134+
half2 reg_a = HALF2(a[idx]);
135+
half2 reg_b = HALF2(b[idx]);
134136
half prod_f16 = (idx < N) ? __hadd(__hmul(reg_a.x, reg_b.x),
135137
__hmul(reg_a.y, reg_b.y)) : __float2half(0.0f);
136138
int warp = tid / WARP_SIZE;
@@ -170,7 +172,7 @@ torch::Tensor dot_prod_##packed_type##_##acc_type(torch::Tensor a, torch::Tensor
170172
const int N = a.size(0); \
171173
CHECK_TORCH_TENSOR_SHAPE(b, N) \
172174
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
173-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK; \
175+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
174176
dim3 block(NUM_THREADS_PER_BLOCK); \
175177
dim3 grid(NUM_BLOCKS); \
176178
dot_prod_##packed_type##_##acc_type##_kernel< \

elementwise/elementwise.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ torch::Tensor elementwise_add_##packed_type(torch::Tensor a, torch::Tensor b) {
8989
CHECK_TORCH_TENSOR_SHAPE(b, N) \
9090
auto c = torch::zeros({N}, options); \
9191
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
92-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
92+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
9393
dim3 block(NUM_THREADS_PER_BLOCK); \
9494
dim3 grid(NUM_BLOCKS); \
9595
elementwise_add_##packed_type##_kernel<<<grid, block>>>( \
@@ -109,7 +109,7 @@ void elementwise_add_##packed_type##_v2(
109109
CHECK_TORCH_TENSOR_SHAPE(b, N) \
110110
CHECK_TORCH_TENSOR_SHAPE(c, N) \
111111
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
112-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
112+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
113113
dim3 block(NUM_THREADS_PER_BLOCK); \
114114
dim3 grid(NUM_BLOCKS); \
115115
elementwise_add_##packed_type##_kernel<<<grid, block>>>( \

histogram/histogram.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ torch::Tensor histogram_##packed_type(torch::Tensor a) {
5959
const int M = max_val.item().to<int>(); \
6060
auto y = torch::zeros({M+1}, options); \
6161
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
62-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
62+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
6363
dim3 block(NUM_THREADS_PER_BLOCK); \
6464
dim3 grid(NUM_BLOCKS); \
6565
histogram_##packed_type##_kernel<<<grid, block>>>( \
@@ -74,4 +74,4 @@ TORCH_BINDING_HIST(i32x4, torch::kInt32, int, 4)
7474
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
7575
TORCH_BINDING_COMMON_EXTENSION(histogram_i32)
7676
TORCH_BINDING_COMMON_EXTENSION(histogram_i32x4)
77-
}
77+
}

reduce/block_all_reduce.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ torch::Tensor block_all_reduce_sum_##packed_type##_##acc_type(torch::Tensor a) {
477477
auto sum = torch::zeros({1}, options); \
478478
const int N = a.size(0); \
479479
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
480-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK; \
480+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
481481
dim3 block(NUM_THREADS_PER_BLOCK); \
482482
dim3 grid(NUM_BLOCKS); \
483483
block_all_reduce_sum_##packed_type##_##acc_type##_kernel< \
@@ -494,7 +494,7 @@ torch::Tensor block_all_reduce_sum_##packed_type##_##acc_type(torch::Tensor a) {
494494
auto sum = torch::zeros({1}, options); \
495495
const int N = a.size(0); \
496496
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
497-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK; \
497+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
498498
dim3 block(NUM_THREADS_PER_BLOCK); \
499499
dim3 grid(NUM_BLOCKS); \
500500
block_all_reduce_sum_##packed_type##_##acc_type##_kernel< \

relu/relu.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ torch::Tensor relu_##packed_type(torch::Tensor x) {
7676
const int N = x.size(0); \
7777
auto y = torch::zeros({N}, options); \
7878
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
79-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
79+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
8080
dim3 block(NUM_THREADS_PER_BLOCK); \
8181
dim3 grid(NUM_BLOCKS); \
8282
relu_##packed_type##_kernel<<<grid, block>>>( \
@@ -92,18 +92,18 @@ void relu_##packed_type##_v2(torch::Tensor x, torch::Tensor y) {
9292
const int N = x.size(0); \
9393
CHECK_TORCH_TENSOR_SHAPE(y, N) \
9494
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
95-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
95+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
9696
dim3 block(NUM_THREADS_PER_BLOCK); \
9797
dim3 grid(NUM_BLOCKS); \
9898
relu_##packed_type##_kernel<<<grid, block>>>( \
9999
reinterpret_cast<element_type*>(x.data_ptr()), \
100100
reinterpret_cast<element_type*>(y.data_ptr()), N); \
101101
}
102102

103-
TORCH_BINDING_RELU(f32, torch::kFloat32, float, 1)
104-
TORCH_BINDING_RELU(f32x4, torch::kFloat32, float, 4)
105-
TORCH_BINDING_RELU(f16, torch::kHalf, half, 1)
106-
TORCH_BINDING_RELU(f16x2, torch::kHalf, half, 2)
103+
TORCH_BINDING_RELU(f32, torch::kFloat32, float, 1)
104+
TORCH_BINDING_RELU(f32x4, torch::kFloat32, float, 4)
105+
TORCH_BINDING_RELU(f16, torch::kHalf, half, 1)
106+
TORCH_BINDING_RELU(f16x2, torch::kHalf, half, 2)
107107
TORCH_BINDING_RELU_V2(f32, torch::kFloat32, float, 1)
108108
TORCH_BINDING_RELU_V2(f32x4, torch::kFloat32, float, 4)
109109
TORCH_BINDING_RELU_V2(f16, torch::kHalf, half, 1)

sigmoid/sigmoid.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ torch::Tensor sigmoid_##packed_type(torch::Tensor x) {
5656
const int N = x.size(0); \
5757
auto y = torch::zeros({N}, options); \
5858
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
59-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
59+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
6060
dim3 block(NUM_THREADS_PER_BLOCK); \
6161
dim3 grid(NUM_BLOCKS); \
6262
sigmoid_##packed_type##_kernel<<<grid, block>>>( \
@@ -72,16 +72,16 @@ void sigmoid_##packed_type##_v2(torch::Tensor x, torch::Tensor y) {
7272
const int N = x.size(0); \
7373
CHECK_TORCH_TENSOR_SHAPE(y, N) \
7474
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
75-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
75+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
7676
dim3 block(NUM_THREADS_PER_BLOCK); \
7777
dim3 grid(NUM_BLOCKS); \
7878
sigmoid_##packed_type##_kernel<<<grid, block>>>( \
7979
reinterpret_cast<element_type*>(x.data_ptr()), \
8080
reinterpret_cast<element_type*>(y.data_ptr()), N); \
8181
}
8282

83-
TORCH_BINDING_SIGMOID(f32, torch::kFloat32, float, 1)
84-
TORCH_BINDING_SIGMOID(f32x4, torch::kFloat32, float, 4)
83+
TORCH_BINDING_SIGMOID(f32, torch::kFloat32, float, 1)
84+
TORCH_BINDING_SIGMOID(f32x4, torch::kFloat32, float, 4)
8585
TORCH_BINDING_SIGMOID_V2(f32, torch::kFloat32, float, 1)
8686
TORCH_BINDING_SIGMOID_V2(f32x4, torch::kFloat32, float, 4)
8787

softmax/softmax.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ torch::Tensor softmax_##packed_type(torch::Tensor x) {
230230
auto y = torch::zeros({N}, options); \
231231
auto total = torch::zeros({1}, options); \
232232
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
233-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
233+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
234234
dim3 block(NUM_THREADS_PER_BLOCK); \
235235
dim3 grid(NUM_BLOCKS); \
236236
softmax_##packed_type##_kernel<NUM_THREADS_PER_BLOCK><<<grid, block>>>( \
@@ -252,7 +252,7 @@ void softmax_##packed_type##_v2(torch::Tensor x, torch::Tensor y) {
252252
if (y.size(0) != N) {throw std::runtime_error("y size mismatch!"); } \
253253
auto total = torch::zeros({1}, options); \
254254
static const int NUM_THREADS_PER_BLOCK = 256 / (n_elements); \
255-
const int NUM_BLOCKS = (N + NUM_THREADS_PER_BLOCK - 1) / NUM_THREADS_PER_BLOCK;\
255+
const int NUM_BLOCKS = (N + 256 - 1) / 256; \
256256
dim3 block(NUM_THREADS_PER_BLOCK); \
257257
dim3 grid(NUM_BLOCKS); \
258258
softmax_##packed_type##_kernel<NUM_THREADS_PER_BLOCK><<<grid, block>>>( \

softmax/softmax.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424

2525
def run_benchmark(perf_func: callable, x: torch.Tensor,
2626
tag: str, out: Optional[torch.Tensor] = None,
27-
warmup: int = 10, iters: int = 1000):
27+
warmup: int = 10, iters: int = 1000,
28+
show_all: bool = False):
2829
if out is not None:
2930
out.fill_(0)
3031
if out is not None:
@@ -50,6 +51,7 @@ def run_benchmark(perf_func: callable, x: torch.Tensor,
5051
out_val = out.flatten().detach().cpu().numpy().tolist()[:3]
5152
out_val = [round(v, 8) for v in out_val]
5253
print(f"{out_info:>20}: {out_val}, time:{mean_time:.8f}ms")
54+
if show_all: print(out)
5355
return out, mean_time
5456

5557

0 commit comments

Comments
 (0)