
layer_norm: refactor of the layernorm operator for the CPU, MLU, and GPU platforms #122


Open · wants to merge 6 commits into base: dev
30 changes: 30 additions & 0 deletions include/ops/layer_norm/layer_norm.h
@@ -0,0 +1,30 @@
#ifndef LAYER_NORM_H
#define LAYER_NORM_H

#include "../../export.h"
#include "../../operators.h"

typedef struct LayerNormDescriptor {
    Device device;
} LayerNormDescriptor;

typedef LayerNormDescriptor *infiniopLayerNormDescriptor_t;

__C __export infiniopStatus_t infiniopCreateLayerNormDescriptor(
    infiniopHandle_t handle,
    infiniopLayerNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t y_desc,
    float epsilon);

__C __export infiniopStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, uint64_t *size);

__C __export infiniopStatus_t infiniopLayerNorm(infiniopLayerNormDescriptor_t desc, void *workspace,
                                                uint64_t workspace_size,
                                                void const *x, void const *w, void const *b, void *y, void *stream);

__C __export infiniopStatus_t infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc);

#endif
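For orientation, the exported functions above follow the usual descriptor lifecycle: create a descriptor from the tensor descriptors, query the workspace size, run the operator, then destroy the descriptor. The snippet below is only an illustrative sketch against these declarations; the handle, the tensor descriptors, the device buffers x, w, b, y, and the workspace allocation are assumed to exist elsewhere and are not part of this PR.

// Illustrative call sequence (assumed context: handle, x_desc/w_desc/b_desc/y_desc,
// and the device buffers x, w, b, y have already been created).
infiniopLayerNormDescriptor_t desc;
if (infiniopCreateLayerNormDescriptor(handle, &desc, x_desc, w_desc, b_desc,
                                      y_desc, 1e-5f) != STATUS_SUCCESS) {
    // handle the error
}

uint64_t workspace_size = 0;
infiniopGetLayerNormWorkspaceSize(desc, &workspace_size);
void *workspace = /* allocate workspace_size bytes on the target device */ nullptr;

infiniopLayerNorm(desc, workspace, workspace_size, x, w, b, y, /*stream=*/nullptr);
infiniopDestroyLayerNormDescriptor(desc);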
168 changes: 168 additions & 0 deletions operatorspy/tests/layer_norm.py
@@ -0,0 +1,168 @@
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import sys
import os


sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
    open_lib,
    to_tensor,
    DeviceEnum,
    infiniopHandle_t,
    infiniopTensorDescriptor_t,
    create_handle,
    destroy_handle,
    create_workspace,
    check_error,
    rearrange_tensor,
)

from operatorspy.tests.test_utils import get_args
import torch
import torch.nn as nn

class LayerNormDescriptor(Structure):
    _fields_ = [("device", c_int32)]


infiniopLayerNormDescriptor_t = POINTER(LayerNormDescriptor)


def LayerNormFunction(input, scale, bias, eps):
    normalized_shape = scale.shape
    layer_norm = nn.LayerNorm(normalized_shape, elementwise_affine=True, eps=eps)
    layer_norm.weight.data = scale
    layer_norm.bias.data = bias
    return layer_norm.forward(input)


def test(lib, handle, torch_device, x_shape, axis, x_dtype=torch.float16):
    print(
        f"Testing LayerNorm on {torch_device} with test_shape:{x_shape}, axis:{axis}, dtype:{x_dtype}"
    )
    eps = 1e-5
    normalized_shape = list(x_shape[axis:])

    x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    scale = torch.rand(normalized_shape, dtype=x_dtype).to(torch_device)
    bias = torch.rand(normalized_shape, dtype=x_dtype).to(torch_device)
    y = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    ans = LayerNormFunction(x, scale, bias, eps)
    x_tensor = to_tensor(x, lib)
    w_tensor = to_tensor(scale, lib)
    b_tensor = to_tensor(bias, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopLayerNormDescriptor_t()
    check_error(
        lib.infiniopCreateLayerNormDescriptor(
            handle, ctypes.byref(descriptor), x_tensor.descriptor, w_tensor.descriptor, b_tensor.descriptor, y_tensor.descriptor, eps
        )
    )
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetLayerNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, torch_device)
    check_error(
        lib.infiniopLayerNorm(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            x_tensor.data,
            w_tensor.data,
            b_tensor.data,
            y_tensor.data,
            None,
        )
    )
    print((y - ans).abs().max())
    assert torch.allclose(y, ans, atol=1e-3, rtol=1e-3)
    check_error(lib.infiniopDestroyLayerNormDescriptor(descriptor))


def test_cpu(lib, test_cases):
    device = DeviceEnum.DEVICE_CPU
    handle = create_handle(lib, device)
    for x_shape, axis, x_dtype in test_cases:
        test(lib, handle, "cpu", x_shape, axis, x_dtype)
    destroy_handle(lib, handle)


def test_cuda(lib, test_cases):
    device = DeviceEnum.DEVICE_CUDA
    handle = create_handle(lib, device)
    for x_shape, axis, x_dtype in test_cases:
        test(lib, handle, "cuda", x_shape, axis, x_dtype)
    destroy_handle(lib, handle)


def test_bang(lib, test_cases):
    import torch_mlu

    device = DeviceEnum.DEVICE_BANG
    handle = create_handle(lib, device)
    for x_shape, axis, x_dtype in test_cases:
        test(lib, handle, "mlu", x_shape, axis, x_dtype)
    destroy_handle(lib, handle)


if __name__ == "__main__":
    test_cases = [
        # x_shape, axis
        # cnnl layernorm does not support axis=0, and torch's CPU LayerNorm does not support half
        # the hand-written layernorm is not accurate enough in float16, but passes the test in float32
        # ((32, 20, 512), 0, torch.float16),
        ((32, 20, 512), 1, torch.float16),
        ((32, 20, 512), 2, torch.float16),

        # ((32, 20, 512), 0, torch.float32),
        ((32, 20, 512), 1, torch.float32),
        ((32, 20, 512), 2, torch.float32),
    ]
    args = get_args()
    lib = open_lib()
    lib.infiniopCreateLayerNormDescriptor.restype = c_int32
    lib.infiniopCreateLayerNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopLayerNormDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
    ]

    lib.infiniopLayerNorm.restype = c_int32
    lib.infiniopLayerNorm.argtypes = [
        infiniopLayerNormDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyLayerNormDescriptor.restype = c_int32
    lib.infiniopDestroyLayerNormDescriptor.argtypes = [
        infiniopLayerNormDescriptor_t,
    ]

    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)

    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("Test passed!")
53 changes: 53 additions & 0 deletions src/ops/layer_norm/bang/layer_norm_bang.cc
@@ -0,0 +1,53 @@
#include "layer_norm_bang.h"
#include "../../utils.h"
infiniopStatus_t bangCreateLayerNormDescriptor(BangHandle_t handle, LayerNormBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t y_desc,
float epsilon) {
if (w_desc->ndim != b_desc->ndim) {
return STATUS_BAD_TENSOR_SHAPE;
}
int wDim = w_desc->ndim;
for(int i = 0; i < wDim; i++){
if(w_desc->shape[i] != b_desc->shape[i]){
return STATUS_BAD_TENSOR_SHAPE;
}
}
int ndim = x_desc->ndim;
for(int i = 0; i < wDim; i++){
if(x_desc->shape[i + ndim - wDim] != w_desc->shape[i]){
return STATUS_BAD_TENSOR_SHAPE;
}
}
if (!dtype_eq(x_desc->dt, F16) && !dtype_eq(x_desc->dt, F32)) {
return STATUS_BAD_TENSOR_DTYPE;
}
int size = 1;
int behindsize = 1;
for(int i = 0; i < ndim; i++){
size *= static_cast<int>(x_desc->shape[i]);
if(i >= ndim - wDim){
behindsize *= static_cast<int>(x_desc->shape[i]);
}
}
*desc_ptr = new LayerNormBangDescriptor{
handle->device,
handle->device_id,
x_desc->dt,
size,
behindsize,
epsilon};

return STATUS_SUCCESS;
}
infiniopStatus_t bangGetLayerNormWorkspaceSize(LayerNormBangDescriptor_t desc, unsigned long int *size) {
    *size = 32 * sizeof(desc->dtype);// taskDim * sizeof(T); taskDim does not exceed 32
    return STATUS_SUCCESS;
}

infiniopStatus_t bangDestroyLayerNormDescriptor(LayerNormBangDescriptor_t desc) {
    delete desc;
    return STATUS_SUCCESS;
}
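As a reading aid for the size and behindsize fields computed above: size is the total element count of x, and behindsize is the product of the trailing dimensions covered by w and b, so normalization runs independently over each contiguous group of behindsize elements. The following is a minimal single-threaded float sketch of that semantics, not the BANG kernel of this PR; layer_norm_ref is a hypothetical helper name.

#include <cmath>
#include <cstddef>

// Reference semantics only: LayerNorm over each row of `behindsize` elements;
// w and b each hold `behindsize` elements.
void layer_norm_ref(const float *x, const float *w, const float *b, float *y,
                    size_t size, size_t behindsize, float eps) {
    for (size_t row = 0; row < size; row += behindsize) {
        float mean = 0.f, var = 0.f;
        for (size_t i = 0; i < behindsize; i++)
            mean += x[row + i];
        mean /= behindsize;
        for (size_t i = 0; i < behindsize; i++) {
            float d = x[row + i] - mean;
            var += d * d;
        }
        var /= behindsize;// biased variance, matching torch.nn.LayerNorm
        float inv_std = 1.f / std::sqrt(var + eps);
        for (size_t i = 0; i < behindsize; i++)
            y[row + i] = (x[row + i] - mean) * inv_std * w[i] + b[i];
    }
}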
36 changes: 36 additions & 0 deletions src/ops/layer_norm/bang/layer_norm_bang.h
@@ -0,0 +1,36 @@
#ifndef __BANG_LAYER_NORM_H__
#define __BANG_LAYER_NORM_H__

#include "../../../devices/bang/bang_handle.h"
#include "../../utils.h"
#include "operators.h"

struct LayerNormBangDescriptor {
    Device device;
    int device_id;
    DT dtype;
    int size;
    int behindsize;
    float epsilon;
};

typedef struct LayerNormBangDescriptor *LayerNormBangDescriptor_t;

infiniopStatus_t bangCreateLayerNormDescriptor(BangHandle_t handle,
                                               LayerNormBangDescriptor_t *desc_ptr,
                                               infiniopTensorDescriptor_t x_desc,
                                               infiniopTensorDescriptor_t w_desc,
                                               infiniopTensorDescriptor_t b_desc,
                                               infiniopTensorDescriptor_t y_desc,
                                               float epsilon);

infiniopStatus_t bangGetLayerNormWorkspaceSize(LayerNormBangDescriptor_t desc, unsigned long int *size);

infiniopStatus_t bangLayerNorm(LayerNormBangDescriptor_t desc, void *workspace,
                               uint64_t workspace_size,
                               void const *x, void const *w, void const *b, void *y,
                               void *stream);

infiniopStatus_t bangDestroyLayerNormDescriptor(LayerNormBangDescriptor_t desc);

#endif // __BANG_LAYER_NORM_H__