Skip to content

[CANN] weight format to nz for Ascend310P3 #14407

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 52 additions & 10 deletions ggml/src/ggml-cann/aclnn_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1783,8 +1783,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
bcast_weight_nb[2], bcast_weight_nb[3],
bcast_weight_nb[4], bcast_weight_nb[5]};
aclTensor* acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
aclTensor* acl_weight_tensor;

bool weightToNZ = false;
#ifdef ASCEND_310P
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
if (weightToNZ && n_dims == 2) {
int64_t acl_stride[2] = {1, transpose_ne[1]};

// Reverse ne.
std::reverse(transpose_ne, transpose_ne + n_dims);

std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};

acl_weight_tensor = aclCreateTensor(
transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
} else {
acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
}
aclTensor* acl_dst =
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);

Expand Down Expand Up @@ -1909,14 +1928,37 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
int64_t output_ne_offset = 0;
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};

aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
aclTensor* acl_weight_tensor;
aclTensor* acl_scale_tensor;

bool weightToNZ = false;
#ifdef ASCEND_310P
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
if (weightToNZ) {
int64_t acl_weight_stride[] = {weight_ne[1], 1};
std::vector<int64_t> storageDims = {weight_ne[0], weight_ne[1]};
acl_weight_tensor = aclCreateTensor(
weight_ne, 2, ggml_cann_type_mapping(type), acl_weight_stride,
weight_ne_offset / ggml_element_size(src0), ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2,
src0->data);

int64_t acl_scale_stride[] = {scale_ne[1], 1};
std::vector<int64_t> scaleStorageDims = {scale_ne[0], scale_ne[1]};
acl_scale_tensor = aclCreateTensor(
scale_ne, 2, ACL_FLOAT16, acl_scale_stride,
scale_ne_offset, ACL_FORMAT_ND, scaleStorageDims.data(), 2,
scale_offset + batch0 * scale_stride);
} else {
acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
}
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
Expand Down
102 changes: 101 additions & 1 deletion ggml/src/ggml-cann/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include <acl/acl.h>
#include <stdarg.h>
#include <aclnnop/aclnn_trans_matmul_weight.h>

#include <cmath>
#include <cstdio>
Expand Down Expand Up @@ -1115,6 +1116,95 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
return GGML_STATUS_SUCCESS;
}

static bool is_matmul_weight(const ggml_tensor* tensor) {
std::string name = ggml_get_name(tensor);
static const std::unordered_set<std::string> weight_suffixes{
"output.weight",
"attn_q.weight",
"attn_k.weight",
"attn_v.weight",
"attn_output.weight",
"ffn_gate.weight",
"ffn_up.weight",
"ffn_down.weight"
};

for (const auto& suffix : weight_suffixes) {
if (name.find(suffix) != std::string::npos) {
return true;
}
}
return false;
}

static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
uint64_t size = 1;
for (auto i : shape) {
size *= i;
}

const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));

size *= sizeof(int16_t);

ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);

std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}

// std::vector<int64_t> storageShape;
// storageShape.push_back(size);
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
shape.data(), shape.size(), *deviceAddr);
return 0;
}

static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
aclrtStream stream;
ACL_CHECK(aclrtCreateStream(&stream));

std::vector<int64_t> weightShape = {tensor->ne[0], tensor->ne[1]};
std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
void *weightDeviceAddr = nullptr;
void *weightTransposedDeviceAddr = nullptr;
aclTensor *weight = nullptr;
aclTensor *weightTransposed = nullptr;
CreateAclTensorWeight(data, weightShape, &weightDeviceAddr, ggml_cann_type_mapping(tensor->type), &weight);
CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
ggml_cann_type_mapping(tensor->type), &weightTransposed);

uint64_t workspaceSize = 0;
aclOpExecutor *executor;
void *workspaceAddr = nullptr;

// TransMatmulWeight
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
if (workspaceSize > 0) {
ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
workspaceAddrPtrTrans.reset(workspaceAddr);
}
ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));

size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);

aclrtMemcpy((char *)tensor->data + offset, size,
weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
ACL_CHECK(aclDestroyTensor(weight));
ACL_CHECK(aclDestroyTensor(weightTransposed));
aclrtFree(weightDeviceAddr);
aclrtFree(weightTransposedDeviceAddr);
if (workspaceSize > 0) {
aclrtFree(workspaceAddr);
}
}

// TODO: need handle tensor which has paddings.
/**
* @brief Set tensor data in a CANN buffer.
Expand All @@ -1139,16 +1229,26 @@ static void ggml_backend_cann_buffer_set_tensor(
// For acl, synchronous functions use this default stream.
// Why aclrtSynchronizeDevice?

bool weightToNZ = false;
#ifdef ASCEND_310P
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
weight_format_to_nz(tensor, data, offset);
}
} else {
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);

ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE));
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
weight_format_to_nz(tensor, transform_buffer, offset);
}
free(transform_buffer);
}
}
Expand Down Expand Up @@ -2044,8 +2144,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return true;
case GGML_TYPE_Q8_0:
return true;
case GGML_TYPE_Q4_0:
#ifdef ASCEND_310P
// Q4 && Q8 per group is not suppor on 310p device
Expand Down