
layer_norm: refactor of the layernorm operator for the CPU, MLU, and GPU platforms #122


Open · wants to merge 6 commits into base: dev
30 changes: 30 additions & 0 deletions include/ops/layer_norm/layer_norm.h
@@ -0,0 +1,30 @@
#ifndef LAYER_NORM_H
#define LAYER_NORM_H

#include "../../export.h"
#include "../../operators.h"

typedef struct LayerNormDescriptor {
    Device device;
} LayerNormDescriptor;

typedef LayerNormDescriptor *infiniopLayerNormDescriptor_t;

__C __export infiniopStatus_t infiniopCreateLayerNormDescriptor(
    infiniopHandle_t handle,
    infiniopLayerNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t y_desc,
    float epsilon);

__C __export infiniopStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, uint64_t *size);

__C __export infiniopStatus_t infiniopLayerNorm(infiniopLayerNormDescriptor_t desc, void *workspace,
                                                uint64_t workspace_size,
                                                void const *x, void const *w, void const *b, void *y, void *stream);

__C __export infiniopStatus_t infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc);

#endif
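For orientation, the exported functions above follow the usual descriptor lifecycle: create a descriptor from the tensor descriptors, query the workspace size, run the operator, then destroy the descriptor. The snippet below is only an illustrative sketch against these declarations; the handle, the tensor descriptors, the device buffers x, w, b, y, and the workspace allocation are assumed to exist elsewhere and are not part of this PR.

// Illustrative call sequence (assumed context: handle, x_desc/w_desc/b_desc/y_desc,
// and the device buffers x, w, b, y have already been created).
infiniopLayerNormDescriptor_t desc;
if (infiniopCreateLayerNormDescriptor(handle, &desc, x_desc, w_desc, b_desc,
                                      y_desc, 1e-5f) != STATUS_SUCCESS) {
    // handle the error
}

uint64_t workspace_size = 0;
infiniopGetLayerNormWorkspaceSize(desc, &workspace_size);
void *workspace = /* allocate workspace_size bytes on the target device */ nullptr;

infiniopLayerNorm(desc, workspace, workspace_size, x, w, b, y, /*stream=*/nullptr);
infiniopDestroyLayerNormDescriptor(desc);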
168 changes: 168 additions & 0 deletions operatorspy/tests/layer_norm.py
@@ -0,0 +1,168 @@
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import sys
import os


sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
    open_lib,
    to_tensor,
    DeviceEnum,
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
    create_handle,
    destroy_handle,
    create_workspace,
    check_error,
    rearrange_tensor,
)

from operatorspy.tests.test_utils import get_args
import torch
import torch.nn as nn

class LayerNormDescriptor(Structure):
    _fields_ = [("device", c_int32)]


infiniopLayerNormDescriptor_t = POINTER(LayerNormDescriptor)


def LayerNormFunction(input, scale, bias, eps):
    normalized_shape = scale.shape
    layer_norm = nn.LayerNorm(normalized_shape, elementwise_affine=True, eps=eps)
    layer_norm.weight.data = scale
    layer_norm.bias.data = bias
    return layer_norm.forward(input)


def test(lib, handle, torch_device, x_shape, axis, x_dtype=torch.float16):
    print(
        f"Testing LayerNorm on {torch_device} with test_shape:{x_shape}, axis:{axis}, dtype:{x_dtype}"
    )
    eps = 1e-5
    normalized_shape = list(x_shape[axis:])

    x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    scale = torch.rand(normalized_shape, dtype=x_dtype).to(torch_device)
    bias = torch.rand(normalized_shape, dtype=x_dtype).to(torch_device)
    y = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    ans = LayerNormFunction(x, scale, bias, eps)
    x_tensor = to_tensor(x, lib)
    w_tensor = to_tensor(scale, lib)
    b_tensor = to_tensor(bias, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopLayerNormDescriptor_t()
    check_error(
        lib.infiniopCreateLayerNormDescriptor(
            handle, ctypes.byref(descriptor), x_tensor.descriptor, w_tensor.descriptor, b_tensor.descriptor, y_tensor.descriptor, eps
        )
    )
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetLayerNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, torch_device)
    check_error(
        lib.infiniopLayerNorm(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            x_tensor.data,
            w_tensor.data,
            b_tensor.data,
            y_tensor.data,
            None,
        )
    )
    print((y - ans).abs().max())
    assert torch.allclose(y, ans, atol=1e-3, rtol=1e-3)
    check_error(lib.infiniopDestroyLayerNormDescriptor(descriptor))


def test_cpu(lib, test_cases):
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for x_shape, axis, x_dtype in test_cases:
        test(lib, handle, "cpu", x_shape, axis, x_dtype)
    destroy_handle(lib, handle)


def test_cuda(lib, test_cases):
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for x_shape, axis, x_dtype in test_cases:
        test(lib, handle, "cuda", x_shape, axis, x_dtype)
    destroy_handle(lib, handle)


def test_bang(lib, test_cases):
    import torch_mlu

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for x_shape, axis, x_dtype in test_cases:
        test(lib, handle, "mlu", x_shape, axis, x_dtype)
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # x_shape, axis
        # cnnl layernorm does not support axis=0, and torch's CPU LayerNorm does not support half
        # the hand-written layernorm is not accurate enough in float16, but passes the test in float32
        # ((32, 20, 512), 0, torch.float16),
        ((32, 20, 512), 1, torch.float16),
        ((32, 20, 512), 2, torch.float16),

        # ((32, 20, 512), 0, torch.float32),
        ((32, 20, 512), 1, torch.float32),
        ((32, 20, 512), 2, torch.float32),
    ]
    args = get_args()
    lib = open_lib()
    lib.infiniopCreateLayerNormDescriptor.restype = c_int32
    lib.infiniopCreateLayerNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopLayerNormDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
    ]

    lib.infiniopLayerNorm.restype = c_int32
    lib.infiniopLayerNorm.argtypes = [
        infiniopLayerNormDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyLayerNormDescriptor.restype = c_int32
    lib.infiniopDestroyLayerNormDescriptor.argtypes = [
        infiniopLayerNormDescriptor_t,
    ]

    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)

    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("Test passed!")
53 changes: 53 additions & 0 deletions src/ops/layer_norm/bang/layer_norm_bang.cc
@@ -0,0 +1,53 @@
#include "layer_norm_bang.h"
#include "../../utils.h"
infiniopStatus_t bangCreateLayerNormDescriptor(BangHandle_t handle, LayerNormBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t y_desc,
float epsilon) {
if (w_desc->ndim != b_desc->ndim) {
return STATUS_BAD_TENSOR_SHAPE;
}
int wDim = w_desc->ndim;
for(int i = 0; i < wDim; i++){
if(w_desc->shape[i] != b_desc->shape[i]){
return STATUS_BAD_TENSOR_SHAPE;
}
}
int ndim = x_desc->ndim;
for(int i = 0; i < wDim; i++){
if(x_desc->shape[i + ndim - wDim] != w_desc->shape[i]){
return STATUS_BAD_TENSOR_SHAPE;
}
}
if (!dtype_eq(x_desc->dt, F16) && !dtype_eq(x_desc->dt, F32)) {
return STATUS_BAD_TENSOR_DTYPE;
}
int size = 1;
int behindsize = 1;
for(int i = 0; i < ndim; i++){
size *= static_cast<int>(x_desc->shape[i]);
if(i >= ndim - wDim){
behindsize *= static_cast<int>(x_desc->shape[i]);
}
}
*desc_ptr = new LayerNormBangDescriptor{
handle->device,
handle->device_id,
x_desc->dt,
size,
behindsize,
epsilon};

return STATUS_SUCCESS;
}
infiniopStatus_t bangGetLayerNormWorkspaceSize(LayerNormBangDescriptor_t desc, unsigned long int *size) {
    *size = 32 * sizeof(desc->dtype);// taskDim * sizeof(T); taskDim does not exceed 32
    return STATUS_SUCCESS;
}

infiniopStatus_t bangDestroyLayerNormDescriptor(LayerNormBangDescriptor_t desc) {
    delete desc;
    return STATUS_SUCCESS;
}
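As a reading aid for the size and behindsize fields computed above: size is the total element count of x, and behindsize is the product of the trailing dimensions covered by w and b, so normalization runs independently over each contiguous group of behindsize elements. The following is a minimal single-threaded float sketch of that semantics, not the BANG kernel of this PR; layer_norm_ref is a hypothetical helper name.

#include <cmath>
#include <cstddef>

// Reference semantics only: LayerNorm over each row of `behindsize` elements;
// w and b each hold `behindsize` elements.
void layer_norm_ref(const float *x, const float *w, const float *b, float *y,
                    size_t size, size_t behindsize, float eps) {
    for (size_t row = 0; row < size; row += behindsize) {
        float mean = 0.f, var = 0.f;
        for (size_t i = 0; i < behindsize; i++)
            mean += x[row + i];
        mean /= behindsize;
        for (size_t i = 0; i < behindsize; i++) {
            float d = x[row + i] - mean;
            var += d * d;
        }
        var /= behindsize;// biased variance, matching torch.nn.LayerNorm
        float inv_std = 1.f / std::sqrt(var + eps);
        for (size_t i = 0; i < behindsize; i++)
            y[row + i] = (x[row + i] - mean) * inv_std * w[i] + b[i];
    }
}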
36 changes: 36 additions & 0 deletions src/ops/layer_norm/bang/layer_norm_bang.h
@@ -0,0 +1,36 @@
#ifndef __BANG_LAYER_NORM_H__
#define __BANG_LAYER_NORM_H__

#include "../../../devices/bang/bang_handle.h"
#include "../../utils.h"
#include "operators.h"

struct LayerNormBangDescriptor {
    Device device;
    int device_id;
    DT dtype;
    int size;
    int behindsize;
    float epsilon;
};

typedef struct LayerNormBangDescriptor *LayerNormBangDescriptor_t;

infiniopStatus_t bangCreateLayerNormDescriptor(BangHandle_t handle,
                                               LayerNormBangDescriptor_t *desc_ptr,
                                               infiniopTensorDescriptor_t x_desc,
                                               infiniopTensorDescriptor_t w_desc,
                                               infiniopTensorDescriptor_t b_desc,
                                               infiniopTensorDescriptor_t y_desc,
                                               float epsilon);

infiniopStatus_t bangGetLayerNormWorkspaceSize(LayerNormBangDescriptor_t desc, unsigned long int *size);

infiniopStatus_t bangLayerNorm(LayerNormBangDescriptor_t desc, void *workspace,
                               uint64_t workspace_size,
                               void const *x, void const *w, void const *b, void *y,
                               void *stream);

infiniopStatus_t bangDestroyLayerNormDescriptor(LayerNormBangDescriptor_t desc);

#endif // __BANG_LAYER_NORM_H__