Skip to content

Commit

Permalink
format
Browse files Browse the repository at this point in the history
  • Loading branch information
wangshuai09 committed Jul 8, 2024
1 parent 34f957f commit e9a550c
Show file tree
Hide file tree
Showing 18 changed files with 309 additions and 315 deletions.
2 changes: 1 addition & 1 deletion ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,7 @@ if (LLAMA_CANN)

# Set header and libs
if(LLAMA_CANN)
set(CANN_INCLUDE_DIRS
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__,
node->name, ggml_op_name(node->op));
}
// if not synchronize, aclrtSynchronizeStream in
// if not synchronize, aclrtSynchronizeStream in
// ggml_backend_cann_synchronize() will raise error.
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
GGML_ASSERT(ok);
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/acl_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,4 @@ struct OpCaller {
OpCaller& run(aclrtStream stream = nullptr);
};

#endif // CANN_ACL_OPS
#endif // CANN_ACL_OPS
6 changes: 3 additions & 3 deletions ggml/src/ggml-cann/acl_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,

aclTensor* acl_tensor = aclCreateTensor(
acl_ne, dims, type_mapping(tensor->type), acl_stride,
offset / ggml_element_size(tensor), format, &acl_storage_ne, 1,
offset / ggml_element_size(tensor), format, &acl_storage_ne, 1,
tensor->data);

return acl_tensor;
Expand All @@ -79,8 +79,8 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);

aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride,
offset / type_size, format, tmp_ne,
aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride,
offset / type_size, format, tmp_ne,
dims, data_ptr);

return acl_tensor;
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/acl_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,

#define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims

#endif // CANN_ACL_TENSOR_H
#endif // CANN_ACL_TENSOR_H
395 changes: 195 additions & 200 deletions ggml/src/ggml-cann/aclnn_ops.cpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions ggml/src/ggml-cann/aclnn_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
ggml_tensor* dst);

template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
Expand Down Expand Up @@ -179,4 +179,4 @@ void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(acl_dst));
}

#endif // CANN_ACLNN_OPS
#endif // CANN_ACLNN_OPS
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,4 @@ struct ggml_backend_cann_context {
aclrtStream stream() { return stream(0); }
};

#endif // CANN_COMMON_H
#endif // CANN_COMMON_H
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/kernels/ascendc_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"

#endif // ASCENDC_KERNELS_H
#endif // ASCENDC_KERNELS_H
62 changes: 31 additions & 31 deletions ggml/src/ggml-cann/kernels/dup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ template <typename SRC_T, typename DST_T>
class DupByRows {
public:
__aicore__ inline DupByRows() {}
__aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
__aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
size_t *input_nb_ub) {
/* Dup by rows when src is contigous on first dimension and dst is
/* Dup by rows when src is contigous on first dimension and dst is
contiguous, each kernel process one row.
*/

Expand All @@ -23,49 +23,49 @@ class DupByRows {
// param
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
num_elem = input_ne_ub[0];
// index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)

// index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
/ (input_ne_ub[1]);
idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
- idx_ne2 * input_ne_ub[1];

// src may not contiguous in dim [1,2,3], so stride decited by ne&nb
src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
+ input_nb_ub[1] * idx_ne1;

// dst is contiguous
dst_stride = (idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) +
idx_ne2 * input_ne_ub[1] +
dst_stride = (idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) +
idx_ne2 * input_ne_ub[1] +
idx_ne1) * (input_ne_ub[0] * sizeof(DST_T));
src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +

src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
src_stride));
dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
dst_stride));

pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
32 - 1) / 32 * 32);
pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
32 - 1) / 32 * 32);
}

__aicore__ inline void copy_in() {
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();

DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
DataCopyPadExtParams<SRC_T> padParams;
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);

src_queue.EnQue(src_local);
}

__aicore__ inline void copy_out() {
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();

DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
Expand All @@ -77,13 +77,13 @@ class DupByRows {
__aicore__ inline void dup() {
// main process, copy one row data from src to dst.
copy_in();

LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();

int32_t BLOCK_NUM = 32 / sizeof(DST_T);
DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
/ BLOCK_NUM * BLOCK_NUM);
DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
/ BLOCK_NUM * BLOCK_NUM);
dst_queue.EnQue<DST_T>(dst_local);

src_queue.FreeTensor(src_local);
Expand All @@ -94,19 +94,19 @@ class DupByRows {
// main process, copy one row data from src to dst.
// cast dtype from src to dst.
copy_in();

LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);

Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
dst_queue.EnQue<DST_T>(dst_local);

src_queue.FreeTensor(src_local);
copy_out();
}

private:

TPipe pipe;
GlobalTensor<SRC_T> src_gm;
GlobalTensor<DST_T> dst_gm;
Expand All @@ -118,7 +118,7 @@ class DupByRows {
int64_t idx_ne1;
int64_t src_stride;
int64_t dst_stride;

TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
};
Expand Down Expand Up @@ -152,7 +152,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(

DupByRows<half, half> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup();
op.dup();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
Expand All @@ -174,7 +174,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(

DupByRows<float_t, float_t> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup();
op.dup();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
Expand All @@ -197,7 +197,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(

DupByRows<float_t, half> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
op.dup_with_cast();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
Expand All @@ -221,5 +221,5 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(

DupByRows<half, float_t> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
}
op.dup_with_cast();
}
16 changes: 8 additions & 8 deletions ggml/src/ggml-cann/kernels/get_row_f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,35 +60,35 @@ class GET_ROW_F16 {

__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
size_t tail = len % 32;
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(half);
DataCopyPadExtParams<half> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
}
input_queue.EnQue(input_local);
}

__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32;
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
}
output_queue.FreeTensor(output_local);
}

__aicore__ inline void calculate_row(int64_t idx) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
Expand All @@ -114,8 +114,8 @@ class GET_ROW_F16 {
copy_in(input_offset, input_ne[0]);
LocalTensor<half> input_local = input_queue.DeQue<half>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
Cast(output_local, input_local, RoundMode::CAST_NONE,

Cast(output_local, input_local, RoundMode::CAST_NONE,
local_buffer_elems);
output_queue.EnQue(output_local);
copy_out(output_offset, input_ne[0]);
Expand Down Expand Up @@ -183,4 +183,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_f16(
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}
}
14 changes: 7 additions & 7 deletions ggml/src/ggml-cann/kernels/get_row_f32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,35 +55,35 @@ class GET_ROW_F32 {

__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
size_t tail = len % 32;
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPadExtParams<float> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
}
input_queue.EnQue(input_local);
}

__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32;
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
}
output_queue.FreeTensor(output_local);
}

__aicore__ inline void calculate_row(int64_t idx) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
Expand All @@ -109,7 +109,7 @@ class GET_ROW_F32 {
copy_in(input_offset, input_ne[0]);
LocalTensor<float> input_local = input_queue.DeQue<float>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();

DataCopy(output_local, input_local, local_buffer_elems);
output_queue.EnQue(output_local);
copy_out(output_offset, input_ne[0]);
Expand Down Expand Up @@ -177,4 +177,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_f32(
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,4 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}
}
Loading

0 comments on commit e9a550c

Please sign in to comment.