del dup.h && update tensor extra
wangshuai09 committed Jul 2, 2024
1 parent 61d0eef commit cc54c6e
Showing 5 changed files with 119 additions and 141 deletions.
13 changes: 9 additions & 4 deletions ggml/src/ggml-cann.cpp
@@ -284,6 +284,13 @@ static void set_tensor_extra(ggml_backend_buffer_t buffer,
                           tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE));
 }

+static void update_tensor_extra(ggml_tensor* tensor) {
+    // When tensor->ne/nb change, make sure the ne/nb in the extra data change too.
+    size_t tensor_meta_size = sizeof(ggml_tensor);
+    ACL_CHECK(aclrtMemcpy(tensor->extra, tensor_meta_size, tensor,
+                          tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE));
+}
+
 GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
     ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
@@ -643,7 +650,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-            // Do nothing with these ops.
+            update_tensor_extra(dst);
             break;
         case GGML_OP_DIAG_MASK_INF:
             ggml_cann_diag_mask(ctx, dst, -INFINITY);
@@ -845,9 +852,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];

-        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE ||
-            node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW ||
-            node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
             continue;
         }

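Note: the CANN backend keeps a device-side copy of every tensor's metadata (the whole ggml_tensor, ne/nb included) in tensor->extra, uploaded by set_tensor_extra. View-type ops (RESHAPE, VIEW, PERMUTE, TRANSPOSE) rewrite ne/nb on the host without moving any data, which is why they are no longer skipped in graph compute: update_tensor_extra(dst) re-uploads the struct so device kernels see the new layout. A minimal host-side sketch of the layout change, using the public ggml API (illustrative only, not part of this commit):

// A permute rewrites ne/nb but aliases the same data; if the device copy
// in p->extra were left stale, kernels would still read t's old layout.
#include "ggml.h"

void layout_changes_without_data_movement(struct ggml_context* ctx,
                                          struct ggml_tensor* t) {
    struct ggml_tensor* p = ggml_permute(ctx, t, 1, 0, 2, 3); // swap dims 0/1
    GGML_ASSERT(p->ne[0] == t->ne[1]);  // extents are permuted...
    GGML_ASSERT(p->nb[0] == t->nb[1]);  // ...and so are the byte strides
}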
88 changes: 31 additions & 57 deletions ggml/src/ggml-cann/aclnn_ops.cpp
@@ -764,15 +764,6 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

     aclTensor* acl_src = create_acl_tensor(src);
     aclTensor* acl_dst = create_acl_tensor(dst);
-
-    // param
-    dup_param param;
-    for (int i = 0; i < 4; i++) {
-        param.src_ne[i] = src->ne[i];
-        param.src_nb[i] = src->nb[i];
-        param.dst_ne[i] = dst->ne[i];
-        param.dst_nb[i] = dst->nb[i];
-    }

     // TODO: simplify
     if (src->type == GGML_TYPE_F16) {
@@ -796,17 +787,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             // src0 is contiguous on the first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);

-            // param copy
-            void *param_buffer;
-            ACL_CHECK(aclrtMalloc(&param_buffer, sizeof(dup_param),
-                                  ACL_MEM_MALLOC_HUGE_FIRST));
-            ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param),
-                                  &param, sizeof(dup_param),
-                                  ACL_MEMCPY_HOST_TO_DEVICE));
-            aclrtlaunch_ascendc_dup_by_rows_fp16(rows_num, ctx.stream(),
-                                                 src->data, dst->data,
-                                                 param_buffer);
+            aclrtlaunch_ascendc_dup_by_rows_fp16(
+                rows_num, ctx.stream(),
+                src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne,
+                ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
             return;
         }
         GGML_ASSERT(false);
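Note: src->extra and dst->extra point at device memory, so these argument expressions are never dereferenced on the host. Member access on an array field like ne only computes base + offsetof and hands the kernel a device address. A sketch of that pointer arithmetic (tensor_meta is a hypothetical stand-in for ggml_tensor's layout):

#include <cstddef>
#include <cstdint>

struct tensor_meta {   // hypothetical stand-in for ggml_tensor
    int64_t ne[4];     // elements per dimension
    size_t  nb[4];     // strides in bytes
};

// No load happens here: ->ne on an array member decays to a pointer,
// i.e. (char*)device_extra + offsetof(tensor_meta, ne), which is still a
// device address; the kernel itself reads the array via copy_to_ub.
int64_t* device_ne(void* device_extra) {
    return ((tensor_meta*)device_extra)->ne;
}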
@@ -825,19 +812,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         if (src->nb[0] == src_type_size) {
             // src0 is contiguous on the first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);
-            // param copy
-            void *param_buffer;
-            ACL_CHECK(aclrtMalloc(&param_buffer, sizeof(dup_param),
-                                  ACL_MEM_MALLOC_HUGE_FIRST));
-            ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param),
-                                  &param, sizeof(dup_param),
-                                  ACL_MEMCPY_HOST_TO_DEVICE));
-            aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(rows_num,
-                                                         ctx.stream(),
-                                                         src->data,
-                                                         dst->data,
-                                                         param_buffer);
+            aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
+                rows_num, ctx.stream(),
+                src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne,
+                ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
             return;
         }
         GGML_ASSERT(false);
@@ -869,17 +850,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         if (src->nb[0] == src_type_size) {
             // src0 is contiguous on the first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);
-            // param copy
-            void *param_buffer;
-            ACL_CHECK(aclrtMalloc(&param_buffer, sizeof(dup_param),
-                                  ACL_MEM_MALLOC_HUGE_FIRST));
-            ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param),
-                                  &param, sizeof(dup_param),
-                                  ACL_MEMCPY_HOST_TO_DEVICE));
-            aclrtlaunch_ascendc_dup_by_rows_fp32(rows_num, ctx.stream(),
-                                                 src->data, dst->data,
-                                                 param_buffer);
+            aclrtlaunch_ascendc_dup_by_rows_fp32(
+                rows_num, ctx.stream(),
+                src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne,
+                ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
             return;
         }
         GGML_ASSERT(false);
@@ -901,19 +878,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         if (src->nb[0] == src_type_size) {
             // src0 is contiguous on the first dimension, copy by rows
             int64_t rows_num = ggml_nrows(src);
-            // param copy
-            void *param_buffer;
-            ACL_CHECK(aclrtMalloc(&param_buffer, sizeof(dup_param),
-                                  ACL_MEM_MALLOC_HUGE_FIRST));
-            ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param),
-                                  &param, sizeof(dup_param),
-                                  ACL_MEMCPY_HOST_TO_DEVICE));
-            aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(rows_num,
-                                                         ctx.stream(),
-                                                         src->data,
-                                                         dst->data,
-                                                         param_buffer);
+            aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
+                rows_num, ctx.stream(),
+                src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne,
+                ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne,
+                ((ggml_tensor*)dst->extra)->nb);
             return;
         }
         GGML_ASSERT(false);
@@ -2237,6 +2208,9 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&param.attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
     memcpy(&param.beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&param.beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+
+    // TODO: ext_factor != 0
+    GGML_ASSERT(param.ext_factor == 0);

     param.n_dims = ((int32_t *) dst->op_params)[1];
     param.n_orig_ctx = ((int32_t *) dst->op_params)[4];
1 change: 0 additions & 1 deletion ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -15,6 +15,5 @@
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
-#include "dup.h"

 #endif // ASCENDC_KERNELS_H
144 changes: 79 additions & 65 deletions ggml/src/ggml-cann/kernels/dup.cpp
@@ -1,5 +1,4 @@
 #include "kernel_operator.h"
-#include "dup.h"

 #include <cmath>

@@ -11,7 +10,8 @@ template <typename SRC_T, typename DST_T>
 class DupByRows {
   public:
     __aicore__ inline DupByRows() {}
-    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, dup_param& param) {
+    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
+                                size_t *input_nb_ub) {
        /* Dup by rows when src is contiguous on the first dimension and dst
           is contiguous; each kernel processes one row.
        */
@@ -21,24 +21,24 @@ class DupByRows {

        int64_t op_block_idx = GetBlockIdx();

        // param
-       num_rows = param.src_ne[1] * param.src_ne[2] * param.src_ne[3];
-       num_elem = param.src_ne[0];
+       num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
+       num_elem = input_ne_ub[0];

        // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
-       idx_ne3 = op_block_idx / (param.src_ne[1] * param.src_ne[2]);
-       idx_ne2 = (op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2]))
-                 / (param.src_ne[1]);
-       idx_ne1 = op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2])
-                 - idx_ne2 * param.src_ne[1];
+       idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
+       idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
+                 / (input_ne_ub[1]);
+       idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
+                 - idx_ne2 * input_ne_ub[1];

        // src may not be contiguous in dims [1,2,3], so the stride is decided by ne & nb
-       src_stride = param.src_nb[3] * idx_ne3 + param.src_nb[2] * idx_ne2
-                    + param.src_nb[1] * idx_ne1;
+       src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
+                    + input_nb_ub[1] * idx_ne1;

        // dst is contiguous
-       dst_stride = (idx_ne3 * (param.src_ne[1] * param.src_ne[2]) +
-                     idx_ne2 * param.src_ne[1] +
-                     idx_ne1) * (param.src_ne[0] * sizeof(DST_T));
+       dst_stride = (idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) +
+                     idx_ne2 * input_ne_ub[1] +
+                     idx_ne1) * (input_ne_ub[0] * sizeof(DST_T));

        src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
                               src_stride));
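For intuition, init unflattens the linear block index into (idx_ne1, idx_ne2, idx_ne3) by mixed-radix division, then rebuilds byte offsets: a strided one for src via nb[], a packed one for dst. The same arithmetic as a host-side sketch (plain C++; names hypothetical):

#include <cstddef>
#include <cstdint>

// Given a row index and the source shape/strides, compute where that row
// starts in src (possibly strided) and in dst (assumed contiguous).
void row_offsets(int64_t block_idx, const int64_t ne[4], const size_t nb[4],
                 size_t dst_elem_size, size_t* src_off, size_t* dst_off) {
    int64_t i3 = block_idx / (ne[1] * ne[2]);
    int64_t i2 = (block_idx - i3 * ne[1] * ne[2]) / ne[1];
    int64_t i1 = block_idx - i3 * ne[1] * ne[2] - i2 * ne[1];
    *src_off = nb[3] * i3 + nb[2] * i2 + nb[1] * i1;       // strided source
    *dst_off = (size_t)(i3 * ne[1] * ne[2] + i2 * ne[1] + i1)
               * ne[0] * dst_elem_size;                    // packed destination
}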
@@ -132,80 +132,94 @@ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
     }
 }

-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(GM_ADDR src_gm,
-                                                               GM_ADDR dst_gm,
-                                                               GM_ADDR param) {
-    // copy params from gm to ub.
-    dup_param param_ub;
-    auto param_gm_ptr = (__gm__ uint8_t*)param;
-    auto param_ub_ptr = (uint8_t*)&param_ub;
-
-    for (int32_t i = 0; i < static_cast<int32_t>(sizeof(dup_param) / sizeof(uint8_t));
-         ++i, ++param_gm_ptr, ++param_ub_ptr) {
-        *param_ub_ptr = *param_gm_ptr;
-    }
+extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
+    GM_ADDR src_gm,
+    GM_ADDR dst_gm,
+    GM_ADDR input_ne_gm,
+    GM_ADDR input_nb_gm,
+    GM_ADDR output_ne_gm,
+    GM_ADDR output_nb_gm) {
+    int64_t input_ne_ub[4];
+    size_t input_nb_ub[4];
+    int64_t output_ne_ub[4];
+    size_t output_nb_ub[4];
+
+    copy_to_ub(input_ne_gm, input_ne_ub, 32);
+    copy_to_ub(input_nb_gm, input_nb_ub, 32);
+    copy_to_ub(output_ne_gm, output_ne_ub, 32);
+    copy_to_ub(output_nb_gm, output_nb_ub, 32);

     DupByRows<half, half> op;
-    op.init(src_gm, dst_gm, param_ub);
+    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
     op.dup();
 }
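Note: the hard-coded 32 is the size in bytes of each metadata array: four 8-byte entries (int64_t, or size_t on an LP64 target). The body of copy_to_ub is truncated in this diff; judging from its signature and the byte-copy loop the removed dup_param path used, a plausible shape is the following sketch (an assumption, not the actual body):

static_assert(sizeof(int64_t[4]) == 32, "ne[] occupies 32 bytes");

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    // Byte-wise copy from global memory into a local buffer, mirroring
    // the loop the old dup_param path used.
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)ub;
    for (size_t i = 0; i < size; ++i, ++gm_ptr, ++ub_ptr) {
        *ub_ptr = *gm_ptr;
    }
}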

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR param) {

// copy params from gm to ub.
dup_param param_ub;
auto param_gm_ptr = (__gm__ uint8_t*)param;
auto param_ub_ptr = (uint8_t*)&param_ub;

for (int32_t i = 0; i < static_cast<int32_t>(sizeof(dup_param) / sizeof(uint8_t));
++i, ++param_gm_ptr, ++param_ub_ptr) {
*param_ub_ptr = *param_gm_ptr;
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];

copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);

DupByRows<float_t, float_t> op;
op.init(src_gm, dst_gm, param_ub);
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR param) {

// copy params from gm to ub.
dup_param param_ub;
auto param_gm_ptr = (__gm__ uint8_t*)param;
auto param_ub_ptr = (uint8_t*)&param_ub;

for (int32_t i = 0; i < static_cast<int32_t>(sizeof(dup_param) / sizeof(uint8_t));
++i, ++param_gm_ptr, ++param_ub_ptr) {
*param_ub_ptr = *param_gm_ptr;
}
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {

int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];

copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);

DupByRows<float_t, half> op;
op.init(src_gm, dst_gm, param_ub);
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR param) {
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {

// copy params from gm to ub.
dup_param param_ub;
auto param_gm_ptr = (__gm__ uint8_t*)param;
auto param_ub_ptr = (uint8_t*)&param_ub;
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];

for (int32_t i = 0; i < static_cast<int32_t>(sizeof(dup_param) / sizeof(uint8_t));
++i, ++param_gm_ptr, ++param_ub_ptr) {
*param_ub_ptr = *param_gm_ptr;
}
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);

DupByRows<half, float_t> op;
op.init(src_gm, dst_gm, param_ub);
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
}
14 changes: 0 additions & 14 deletions ggml/src/ggml-cann/kernels/dup.h

This file was deleted.
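The deleted header itself is not shown, but from its former uses above (param.src_ne / src_nb / dst_ne / dst_nb, four entries each) it presumably declared the dup_param struct roughly as follows. This is a reconstruction from usage, not the original 14-line file:

#ifndef DUP_H
#define DUP_H

#include <cstdint>
#include <cstddef>

typedef struct {
    int64_t src_ne[4];  // source extents per dimension
    size_t  src_nb[4];  // source strides in bytes
    int64_t dst_ne[4];  // destination extents per dimension
    size_t  dst_nb[4];  // destination strides in bytes
} dup_param;

#endif  // DUP_H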
