From 94610511daf7d2ccac8c5ff047da43e5a99cad77 Mon Sep 17 00:00:00 2001
From: huafengchun
Date: Fri, 26 Apr 2024 07:59:06 +0000
Subject: [PATCH] add q8_t transform

---
 ggml-cann.cpp           | 71 +++++++++++++++++++++++++++++++++++++++--
 ggml-cann/aclnn_ops.cpp |  7 ++++
 2 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/ggml-cann.cpp b/ggml-cann.cpp
index 0ab1f1b36a310..0aa36037db32d 100644
--- a/ggml-cann.cpp
+++ b/ggml-cann.cpp
@@ -142,7 +142,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, cons
     GGML_ASSERT(tensor->extra == nullptr);
     GGML_ASSERT(tensor->op == GGML_OP_NONE);
 
-    void *buffer_host;
     size_t n_bytes = ggml_nbytes(tensor);
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -176,7 +175,6 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor* t
     GGML_ASSERT(tensor->extra == nullptr);
     GGML_ASSERT(tensor->op == GGML_OP_NONE);
 
-    void *buffer_host;
     size_t n_bytes = ggml_nbytes(tensor);
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -206,12 +204,66 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor* t
     }
 }
 
+#define QK8_0 32
+typedef struct {
+    uint16_t d;       // delta
+    int8_t qs[QK8_0]; // quants
+} block_q8_0;
+
+GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, const void *src, void* dst) {
+    GGML_ASSERT(tensor->extra == nullptr);
+    GGML_ASSERT(tensor->op == GGML_OP_NONE);
+
+    size_t n_bytes = ggml_nbytes(tensor);
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK8_0;
+    size_t quant_bytes = n_elems * sizeof(uint8_t);
+
+    uint8_t* quant_offset = (uint8_t*)dst;
+    uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        block_q8_0* group = (block_q8_0*)((char*)src + i * sizeof(block_q8_0));
+        *scale_offset = group->d;
+        scale_offset++;
+        size_t group_quant_size = QK8_0 * sizeof(uint8_t);
+        memcpy(quant_offset, group->qs, group_quant_size);
+        quant_offset += group_quant_size;
+    }
+}
+
+GGML_CALL static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor* tensor, const void *src, void* dst) {
+    GGML_ASSERT(tensor->extra == nullptr);
+    GGML_ASSERT(tensor->op == GGML_OP_NONE);
+
+    size_t n_bytes = ggml_nbytes(tensor);
+    int64_t n_elems = ggml_nelements(tensor);
+    int64_t groups = n_elems / QK8_0;
+    size_t quant_bytes = n_elems * sizeof(uint8_t);
+
+    uint8_t* quant_offset = (uint8_t*)src;
+    uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
+
+    for (int i = 0; i < groups; i++) {
+        block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
+        group->d = *scale_offset;
+        scale_offset++;
+        size_t group_quant_size = QK8_0 * sizeof(uint8_t);
+        memcpy(group->qs, quant_offset, group_quant_size);
+        quant_offset += group_quant_size;
+    }
+}
+
+
 GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, const void* src, void *dst) {
     std::cout<<"Transform tensor:"<<tensor->name<<std::endl;
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
             ggml_backend_cann_transform_q4_0(tensor, src, dst);
             break;
+        case GGML_TYPE_Q8_0:
+            ggml_backend_cann_transform_q8_0(tensor, src, dst);
+            break;
         default:
             break;
     }
@@ -223,6 +275,9 @@ GGML_CALL static void ggml_backend_cann_transform_back(const ggml_tensor* tensor
         case GGML_TYPE_Q4_0:
             ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
             break;
+        case GGML_TYPE_Q8_0:
+            ggml_backend_cann_transform_back_q8_0(tensor, src, dst);
+            break;
         default:
             break;
     }
@@ -231,6 +286,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(const ggml_tensor* tensor
 GGML_CALL static bool need_transform(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q8_0:
             return true;
         default:
             return false;
@@ -820,7 +876,16 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_MUL_MAT_ID:
         // embedding
         case GGML_OP_GET_ROWS:
-            return false;
+            {
+                switch (op->src[0]->type) {
+                    //case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            }
+            break;
         case GGML_OP_CPY:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
diff --git a/ggml-cann/aclnn_ops.cpp b/ggml-cann/aclnn_ops.cpp
index 2e3a78d01709c..ba0b4763e48be 100644
--- a/ggml-cann/aclnn_ops.cpp
+++ b/ggml-cann/aclnn_ops.cpp
@@ -1687,4 +1687,11 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyScalar(acl_scale));
     ACL_CHECK(aclDestroyTensor(temp_tensor));
     ACL_CHECK(aclDestroyTensor(temp_output_tensor));
+}
+
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];
+
 }
\ No newline at end of file