diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index be9bfa51dccb2..a9655c01465a6 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -284,6 +284,13 @@ static void set_tensor_extra(ggml_backend_buffer_t buffer, tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE)); } +static void update_tensor_extra(ggml_tensor* tensor) { + // when tensor->ne/nb changed, make sure ne/nb in extra data also changed. + size_t tensor_meta_size = sizeof(ggml_tensor); + ACL_CHECK(aclrtMemcpy(tensor->extra, tensor_meta_size, tensor, + tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE)); +} + GGML_CALL static void ggml_backend_cann_buffer_init_tensor( ggml_backend_buffer_t buffer, ggml_tensor* tensor) { if (tensor->view_src != NULL && tensor->view_offs == 0) { @@ -643,7 +650,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - // Do nothing with these ops. + update_tensor_extra(dst); break; case GGML_OP_DIAG_MASK_INF: ggml_cann_diag_mask(ctx, dst, -INFINITY); @@ -845,9 +852,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor* node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || - node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || - node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_NONE) { continue; } diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 7c0c60b251e81..623a39bdd6617 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -764,15 +764,6 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); - - // param - dup_param param; - for (int i=0; i<4; i++) { - param.src_ne[i] = src->ne[i]; - param.src_nb[i] = src->nb[i]; - param.dst_ne[i] = dst->ne[i]; - param.dst_nb[i] = dst->nb[i]; - } // TODO: simplefify if (src->type==GGML_TYPE_F16) { @@ -796,17 +787,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); - // param copy - void *param_buffer; - ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), - ACL_MEM_MALLOC_HUGE_FIRST)); - - ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), - ¶m, sizeof(dup_param), - ACL_MEMCPY_HOST_TO_DEVICE)); - aclrtlaunch_ascendc_dup_by_rows_fp16(rows_num, ctx.stream(), - src->data, dst->data, - param_buffer); + aclrtlaunch_ascendc_dup_by_rows_fp16( + rows_num, ctx.stream(), + src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); @@ -825,19 +812,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); - // param copy - void *param_buffer; - ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), - ACL_MEM_MALLOC_HUGE_FIRST)); - - ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), - ¶m, sizeof(dup_param), - ACL_MEMCPY_HOST_TO_DEVICE)); - aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(rows_num, - ctx.stream(), - src->data, - dst->data, - param_buffer); + aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( + rows_num, ctx.stream(), + src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); @@ -869,17 +850,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); - // param copy - void *param_buffer; - ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), - ACL_MEM_MALLOC_HUGE_FIRST)); - - ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), - ¶m, sizeof(dup_param), - ACL_MEMCPY_HOST_TO_DEVICE)); - aclrtlaunch_ascendc_dup_by_rows_fp32(rows_num, ctx.stream(), - src->data, dst->data, - param_buffer); + aclrtlaunch_ascendc_dup_by_rows_fp32( + rows_num, ctx.stream(), + src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); @@ -901,19 +878,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); - // param copy - void *param_buffer; - ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), - ACL_MEM_MALLOC_HUGE_FIRST)); - - ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), - ¶m, sizeof(dup_param), - ACL_MEMCPY_HOST_TO_DEVICE)); - aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(rows_num, - ctx.stream(), - src->data, - dst->data, - param_buffer); + aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( + rows_num, ctx.stream(), + src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); @@ -2237,6 +2208,9 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { memcpy(¶m.attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); memcpy(¶m.beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(¶m.beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + // TODO: ext_factor != 0 + GGML_ASSERT(param.ext_factor == 0); param.n_dims = ((int32_t *) dst->op_params)[1]; param.n_orig_ctx = ((int32_t *) dst->op_params)[4]; diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h index 6ea8a97e79575..af0a01c74647c 100644 --- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h +++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h @@ -15,6 +15,5 @@ #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" -#include "dup.h" #endif // ASCENDC_KERNELS_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp index 3da38d918bda7..c5d086d3f1d7f 100644 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -1,5 +1,4 @@ #include "kernel_operator.h" -#include "dup.h" #include @@ -11,7 +10,8 @@ template class DupByRows { public: __aicore__ inline DupByRows() {} - __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, dup_param& param) { + __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub, + size_t *input_nb_ub) { /* Dup by rows when src is contigous on first dimension and dst is contiguous, each kernel process one row. */ @@ -21,24 +21,24 @@ class DupByRows { int64_t op_block_idx = GetBlockIdx(); // param - num_rows = param.src_ne[1] * param.src_ne[2] * param.src_ne[3]; - num_elem = param.src_ne[0]; + num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; + num_elem = input_ne_ub[0]; // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) - idx_ne3 = op_block_idx / (param.src_ne[1] * param.src_ne[2]); - idx_ne2 = (op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2])) - / (param.src_ne[1]); - idx_ne1 = op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2]) - - idx_ne2 * param.src_ne[1]; + idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]); + idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])) + / (input_ne_ub[1]); + idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) + - idx_ne2 * input_ne_ub[1]; // src may not contiguous in dim [1,2,3], so stride decited by ne&nb - src_stride = param.src_nb[3] * idx_ne3 + param.src_nb[2] * idx_ne2 - + param.src_nb[1] * idx_ne1; + src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2 + + input_nb_ub[1] * idx_ne1; // dst is contiguous - dst_stride = (idx_ne3 * (param.src_ne[1] * param.src_ne[2]) + - idx_ne2 * param.src_ne[1] + - idx_ne1) * (param.src_ne[0] * sizeof(DST_T)); + dst_stride = (idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) + + idx_ne2 * input_ne_ub[1] + + idx_ne1) * (input_ne_ub[0] * sizeof(DST_T)); src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + src_stride)); @@ -132,80 +132,94 @@ __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { } } -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR param) { +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { - // copy params from gm to ub. - dup_param param_ub; - auto param_gm_ptr = (__gm__ uint8_t*)param; - auto param_ub_ptr = (uint8_t*)¶m_ub; + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; - for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); - ++i, ++param_gm_ptr, ++param_ub_ptr) { - *param_ub_ptr = *param_gm_ptr; - } + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); DupByRows op; - op.init(src_gm, dst_gm, param_ub); + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); op.dup(); } -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR param) { - - // copy params from gm to ub. - dup_param param_ub; - auto param_gm_ptr = (__gm__ uint8_t*)param; - auto param_ub_ptr = (uint8_t*)¶m_ub; - - for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); - ++i, ++param_gm_ptr, ++param_ub_ptr) { - *param_ub_ptr = *param_gm_ptr; - } +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); DupByRows op; - op.init(src_gm, dst_gm, param_ub); + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); op.dup(); } extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR param) { - - // copy params from gm to ub. - dup_param param_ub; - auto param_gm_ptr = (__gm__ uint8_t*)param; - auto param_ub_ptr = (uint8_t*)¶m_ub; - - for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); - ++i, ++param_gm_ptr, ++param_ub_ptr) { - *param_ub_ptr = *param_gm_ptr; - } + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { + + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); DupByRows op; - op.init(src_gm, dst_gm, param_ub); + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); op.dup_with_cast(); } extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR param) { + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { // copy params from gm to ub. - dup_param param_ub; - auto param_gm_ptr = (__gm__ uint8_t*)param; - auto param_ub_ptr = (uint8_t*)¶m_ub; + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; - for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); - ++i, ++param_gm_ptr, ++param_ub_ptr) { - *param_ub_ptr = *param_gm_ptr; - } + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); DupByRows op; - op.init(src_gm, dst_gm, param_ub); + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); op.dup_with_cast(); } \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/dup.h b/ggml/src/ggml-cann/kernels/dup.h deleted file mode 100644 index d58cd15ef2563..0000000000000 --- a/ggml/src/ggml-cann/kernels/dup.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef DUP_H -#define DUP_H - -#pragma pack(push, 8) -typedef struct { - int64_t src_ne[4]; - int64_t src_nb[4]; - int64_t dst_ne[4]; - int64_t dst_nb[4]; - -} dup_param; -#pragma pack(pop) - -#endif //DUP_H \ No newline at end of file