
Commit 54d35a8

Implemented faster kernels optimized for A100 GPUs

1 parent 669e515

5 files changed (+165, -16 lines)


opt.py

Lines changed: 7 additions & 3 deletions

@@ -229,7 +229,7 @@ def forward(self, inp, **kwargs):
 def opt_pack3(model, quantizers):
     layers = find_layers(model)
     layers = {n: layers[n] for n in quantizers}
-    make_quant3(model, quantizers)
+    make_quant3(model, quantizers, faster=args.faster_kernel)
     qlayers = find_layers(model, [Quant3Linear])
     print('Packing ...')
     for name in qlayers:
@@ -258,7 +258,7 @@ def noop(*args, **kwargs):
     for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
         if name in layers:
             del layers[name]
-    make_quant3(model, layers)
+    make_quant3(model, layers, faster=args.faster_kernel)

     print('Loading model ...')
     model.load_state_dict(torch.load(checkpoint))
@@ -416,7 +416,11 @@ def sync():
     )
     parser.add_argument(
         '--new-eval', action='store_true',
-        help='Whether to use the new PTB and C4 eval'
+        help='Whether to use the new PTB and C4 eval.'
+    )
+    parser.add_argument(
+        '--faster-kernel', action='store_true',
+        help='Whether to use the new faster kernel for benchmarking.'
     )

     args = parser.parse_args()
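
For reference, the new flag only changes how quantized layers are constructed. A minimal, self-contained sketch of the resulting behavior, assuming quant.py from this repo is importable (the Tiny module and its sizes are hypothetical; make_quant3 and Quant3Linear are the names from this commit):

import torch.nn as nn
from quant import make_quant3, Quant3Linear

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(1024, 1024)

model = Tiny()
# opt.py now forwards faster=args.faster_kernel at this point:
make_quant3(model, {'fc'}, faster=True)
# The nn.Linear has been swapped for a Quant3Linear whose forward() will
# dispatch to quant_cuda.vecquant3matmul_faster on half-precision inputs.
assert isinstance(model.fc, Quant3Linear) and model.fc.faster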

quant.py

Lines changed: 11 additions & 6 deletions

@@ -136,14 +136,15 @@ def ready(self):
 # Assumes layer is perfectly divisible into 1024 * 1024 blocks
 class Quant3Linear(nn.Module):

-    def __init__(self, infeatures, outfeatures):
+    def __init__(self, infeatures, outfeatures, faster=False):
         super().__init__()
         self.register_buffer('zeros', torch.zeros((outfeatures, 1)))
         self.register_buffer('scales', torch.zeros((outfeatures, 1)))
         self.register_buffer('bias', torch.zeros(outfeatures))
         self.register_buffer(
             'qweight', torch.zeros((infeatures // 32 * 3, outfeatures), dtype=torch.int)
         )
+        self.faster = faster

     def pack(self, linear, scales, zeros):
         self.zeros = zeros * scales
@@ -187,21 +188,25 @@ def forward(self, x):
             y = self.bias.clone()
             outshape[-1] = self.bias.numel()
             dtype = x.dtype
-            x = x.float()
-            quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.zeros)
+            if self.faster:
+                x = x.half()
+                quant_cuda.vecquant3matmul_faster(x, self.qweight, y, self.scales, self.zeros)
+            else:
+                x = x.float()
+                quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.zeros)
             y = y.to(dtype)
             return y.reshape(outshape)
         raise ValueError('Only supports a single token currently.')

-def make_quant3(module, names, name=''):
+def make_quant3(module, names, name='', faster=False):
     if isinstance(module, Quant3Linear):
         return
     for attr in dir(module):
         tmp = getattr(module, attr)
         name1 = name + '.' + attr if name != '' else attr
         if name1 in names:
             setattr(
-                module, attr, Quant3Linear(tmp.in_features, tmp.out_features)
+                module, attr, Quant3Linear(tmp.in_features, tmp.out_features, faster=faster)
             )
     for name1, child in module.named_children():
-        make_quant3(child, names, name + '.' + name1 if name != '' else name1)
+        make_quant3(child, names, name + '.' + name1 if name != '' else name1, faster=faster)
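
The dispatch above is the only behavioral change in forward(): with faster=True the activations are cast to half precision, while the result is still accumulated into the fp32 buffer cloned from bias and cast back to the input dtype at the end. A minimal usage sketch, assuming the quant_cuda extension is built and a CUDA device is available; the sizes are hypothetical and the layer keeps its zero-initialized buffers, so this only exercises the call path:

import torch
from quant import Quant3Linear

# `faster=True` is the new keyword introduced by this commit.
layer = Quant3Linear(4096, 4096, faster=True).cuda()

# forward() only supports a single token, so the last dim must hold all elements.
x = torch.randn(1, 1, 4096, device='cuda', dtype=torch.half)
with torch.no_grad():
    y = layer(x)          # casts x to half and calls quant_cuda.vecquant3matmul_faster
print(y.dtype, y.shape)   # comes back in x's dtype: torch.float16, shape (1, 1, 4096)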

quant_cuda.cpp

Lines changed: 14 additions & 0 deletions

@@ -5,6 +5,11 @@
 void vecquant3matmul_cuda(
   torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
   torch::Tensor scales, torch::Tensor zeros
+);
+
+void vecquant3matmul_faster_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
 );

 void vecquant3matmul(
@@ -15,6 +20,15 @@ void vecquant3matmul(
   vecquant3matmul_cuda(vec, mat, mul, scales, zeros);
 }

+void vecquant3matmul_faster(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
+  vecquant3matmul_faster_cuda(vec, mat, mul, scales, zeros);
+}
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
+  m.def("vecquant3matmul_faster", &vecquant3matmul_faster, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
 }
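
As before, the C++ wrapper only sets the CUDA device guard and forwards raw tensors to the launcher, so the new entry point can also be called directly from Python. A hedged sketch of such a call; the sizes are hypothetical, and the shapes and dtypes are inferred from Quant3Linear's buffers and the kernel signature:

import torch
import quant_cuda   # the extension built from quant_cuda.cpp / quant_cuda_kernel.cu

infeatures, outfeatures = 4096, 4096
vec    = torch.randn(infeatures, device='cuda', dtype=torch.half)                        # read by the kernel as half2 pairs
mat    = torch.zeros(infeatures // 32 * 3, outfeatures, device='cuda', dtype=torch.int)  # packed 3-bit weights
mul    = torch.zeros(outfeatures, device='cuda', dtype=torch.float)                      # fp32 result, accumulated via atomicAdd
scales = torch.ones(outfeatures, device='cuda', dtype=torch.float)
zeros  = torch.zeros(outfeatures, device='cuda', dtype=torch.float)

quant_cuda.vecquant3matmul_faster(vec, mat, mul, scales, zeros)
torch.cuda.synchronize()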

quant_cuda_kernel.cu

Lines changed: 119 additions & 3 deletions

@@ -2,6 +2,7 @@
 #include <torch/python.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <cuda_fp16.h>

 template <typename scalar_t>
 __global__ void VecQuant3MatMulKernel(
@@ -14,8 +15,18 @@ __global__ void VecQuant3MatMulKernel(
   int width
 );

-const int BLOCKWIDTH = 1024;
-const int BLOCKHEIGHT = 96;
+__global__ void VecQuant3MatMulKernelFaster(
+  const half2* __restrict__ vec,
+  const int* __restrict__ mat,
+  float* __restrict__ mul,
+  const float* __restrict__ scales,
+  const float* __restrict__ zeros,
+  int height,
+  int width
+);
+
+const int BLOCKWIDTH = 256;
+const int BLOCKHEIGHT = 24;

 void vecquant3matmul_cuda(
   torch::Tensor vec,
@@ -29,7 +40,7 @@ void vecquant3matmul_cuda(

   dim3 blocks(
     (height + BLOCKHEIGHT - 1) / BLOCKHEIGHT,
-    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
   );
   dim3 threads(BLOCKWIDTH);

@@ -44,6 +55,32 @@ void vecquant3matmul_cuda(
   );
 }

+void vecquant3matmul_faster_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int height = mat.size(0);
+  int width = mat.size(1);
+
+  dim3 blocks(
+    (height + BLOCKHEIGHT - 1) / BLOCKHEIGHT,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+
+  VecQuant3MatMulKernelFaster<<<blocks, threads>>>(
+    (half2*) vec.data_ptr(),
+    mat.data_ptr<int>(),
+    mul.data_ptr<float>(),
+    scales.data_ptr<float>(),
+    zeros.data_ptr<float>(),
+    height, width
+  );
+}
+
 __device__ inline unsigned int as_unsigned(int i) {
   return *reinterpret_cast<unsigned int*>(&i);
 }
@@ -126,3 +163,82 @@ __global__ void VecQuant3MatMulKernel(

   atomicAdd(&mul[col], res);
 }
+
+__global__ void VecQuant3MatMulKernelFaster(
+  const half2* __restrict__ vec,
+  const int* __restrict__ mat,
+  float* __restrict__ mul,
+  const float* __restrict__ scales,
+  const float* __restrict__ zeros,
+  int height,
+  int width
+) {
+  const int blockwidth2 = BLOCKWIDTH / 2;
+
+  int row = BLOCKHEIGHT * blockIdx.x;
+  int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
+
+  __shared__ half2 blockvec[blockwidth2];
+  if (threadIdx.x < blockwidth2)
+    blockvec[threadIdx.x] = vec[(row / BLOCKHEIGHT) * blockwidth2 + threadIdx.x];
+
+  __shared__ half2 deq2[64][32];
+  int val = threadIdx.x / 32;
+  int off = threadIdx.x % 32;
+  for (; val < 64; val += BLOCKWIDTH / 32) {
+    deq2[val][off] = __halves2half2(
+      __int2half_rn(val & 0x7), __int2half_rn(val >> 3)
+    );
+  }
+
+  half2 scale = __float2half2_rn(scales[col]);
+  half2 zero = __float2half2_rn(-zeros[col]);
+
+  int i = width * row + col;
+  int k = 0;
+
+  float res = 0;
+  half2 res2;
+
+  unsigned int tmp1;
+  unsigned int tmp2;
+  unsigned int tmp;
+
+  __syncthreads();
+
+  while (k < blockwidth2) {
+    res2 = {};
+    tmp1 = as_unsigned(mat[i]);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 0) & 0x3f][off], scale, zero), blockvec[k + 0], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 6) & 0x3f][off], scale, zero), blockvec[k + 1], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 12) & 0x3f][off], scale, zero), blockvec[k + 2], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 18) & 0x3f][off], scale, zero), blockvec[k + 3], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 24) & 0x3f][off], scale, zero), blockvec[k + 4], res2);
+    i += width;
+    tmp2 = as_unsigned(mat[i]);
+    tmp = (tmp1 >> 30) | ((tmp2 << 2) & 0x3c);
+    res2 = __hfma2(__hfma2(deq2[tmp][off], scale, zero), blockvec[k + 5], res2);
+    tmp2 >>= 4;
+    k += 6;
+    res2 = __hfma2(__hfma2(deq2[(tmp2 >> 0) & 0x3f][off], scale, zero), blockvec[k + 0], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp2 >> 6) & 0x3f][off], scale, zero), blockvec[k + 1], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp2 >> 12) & 0x3f][off], scale, zero), blockvec[k + 2], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp2 >> 18) & 0x3f][off], scale, zero), blockvec[k + 3], res2);
+    i += width;
+    tmp1 = as_unsigned(mat[i]);
+    tmp = (tmp2 >> 24) | ((tmp1 << 4) & 0x30);
+    res2 = __hfma2(__hfma2(deq2[tmp][off], scale, zero), blockvec[k + 4], res2);
+    tmp1 >>= 2;
+    k += 5;
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 0) & 0x3f][off], scale, zero), blockvec[k + 0], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 6) & 0x3f][off], scale, zero), blockvec[k + 1], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 12) & 0x3f][off], scale, zero), blockvec[k + 2], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 18) & 0x3f][off], scale, zero), blockvec[k + 3], res2);
+    res2 = __hfma2(__hfma2(deq2[(tmp1 >> 24) & 0x3f][off], scale, zero), blockvec[k + 4], res2);
+    i += width;
+    k += 5;
+    res += __half2float(res2.x) + __half2float(res2.y);
+  }
+
+  atomicAdd(&mul[col], res);
+}
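
The faster kernel keeps the existing packing: every BLOCKHEIGHT = 24 rows of int32 hold 24 * 32 / 3 = 256 three-bit weights, one per column of a BLOCKWIDTH = 256 thread block, and the shared deq2 table maps each 6-bit code to a half2 holding the two 3-bit values it encodes. The Python sketch below is an illustration only, not part of the repository; it re-derives that bit layout and checks that the kernel's shift/mask sequence recovers the packed values, including the two codes that straddle 32-bit word boundaries:

import random

def pack3(vals):
    # Pack 32 values in [0, 8) into three 32-bit words, value j at bits 3j..3j+2.
    assert len(vals) == 32 and all(0 <= v < 8 for v in vals)
    bits = 0
    for j, v in enumerate(vals):
        bits |= v << (3 * j)
    return [(bits >> (32 * w)) & 0xffffffff for w in range(3)]

def unpack3_pairs(words):
    # Recover the 16 six-bit pair codes using the same shift/mask sequence as the kernel.
    pairs = []
    tmp1 = words[0]
    for s in (0, 6, 12, 18, 24):                       # pairs 0-4
        pairs.append((tmp1 >> s) & 0x3f)
    tmp2 = words[1]
    pairs.append((tmp1 >> 30) | ((tmp2 << 2) & 0x3c))  # pair 5 straddles words 0/1
    tmp2 >>= 4
    for s in (0, 6, 12, 18):                           # pairs 6-9
        pairs.append((tmp2 >> s) & 0x3f)
    tmp1 = words[2]
    pairs.append((tmp2 >> 24) | ((tmp1 << 4) & 0x30))  # pair 10 straddles words 1/2
    tmp1 >>= 2
    for s in (0, 6, 12, 18, 24):                       # pairs 11-15
        pairs.append((tmp1 >> s) & 0x3f)
    return pairs

vals = [random.randrange(8) for _ in range(32)]
codes = unpack3_pairs(pack3(vals))
# deq2 maps a code to the half2 (code & 7, code >> 3); the flat list must round-trip.
recovered = [v for c in codes for v in (c & 0x7, c >> 3)]
assert recovered == vals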

test_kernel.py

Lines changed: 14 additions & 4 deletions

@@ -10,8 +10,8 @@

 DEV = torch.device('cuda:0')

-M = 12288
-N = 12288 * 4
+M = 12288 * 4
+N = 12288

 DTYPE = torch.half
 mat = torch.randn((M, N), device=DEV, dtype=DTYPE)
@@ -43,6 +43,14 @@
 torch.cuda.synchronize()
 print('3bit:', (time.time() - tick) / COUNT)

+COUNT = 1000
+import time
+tick = time.time()
+for _ in range(COUNT):
+    quant_cuda.vecquant3matmul_faster(vec, mat, mul, scales, zeros)
+torch.cuda.synchronize()
+print('3bit:', (time.time() - tick) / COUNT, '(faster)')
+
 print('Verifiying kernel correctness ...')

 M = 4 * 4096
@@ -66,5 +74,7 @@
 layer = layer.to(DEV)

 with torch.no_grad():
-    print('Simu:', qlayer(vec))
-    print('Kern:', layer.to(DEV)(vec))
+    print('Simu:', layer.to(DEV)(vec))
+    print('Kern:', qlayer(vec))
+    qlayer.faster = True
+    print('Kern:', qlayer(vec.half()), '(faster)')
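
The final block prints the simulated and kernel outputs for manual inspection. A hedged add-on that reports the error numerically instead, reusing layer, qlayer, vec and DEV exactly as test_kernel.py defines them (the printed maxima are a suggestion, not output the script currently produces):

with torch.no_grad():
    ref  = layer.to(DEV)(vec).float()   # full-precision reference ('Simu')
    out  = qlayer(vec).float()          # 3-bit kernel, fp32 path
    qlayer.faster = True
    fast = qlayer(vec.half()).float()   # 3-bit kernel, new half-precision path
    print('max abs err (std)   :', (out - ref).abs().max().item())
    print('max abs err (faster):', (fast - ref).abs().max().item())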
