diff --git a/common/common.cpp b/common/common.cpp index 57d03a5789edd..16b41b4d1948a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -60,6 +60,10 @@ #define GGML_USE_CUDA_SYCL_VULKAN #endif +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_CANN) +#define GGML_USE_CUDA_SYCL_CANN +#endif + #if defined(LLAMA_USE_CURL) #ifdef __linux__ #include diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d641a9f12b388..a6497b6e0bf82 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -23,6 +23,10 @@ #include "ggml-cuda.h" #include "ggml-sycl.h" +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -120,6 +124,17 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_CANN + uint32_t count = ggml_backend_cann_get_device_count(); + for (uint32_t i = 0; i < count; i++) { + char buf[128]; + ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } #endif // TODO: other backends return id; diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d6882eec31e1a..4571181879ae8 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -16,6 +16,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: CLIP using Metal backend\n", __func__); #endif +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + printf("%s: CLIP using CANN backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h new file mode 100644 index 0000000000000..a15d6020bbaac --- /dev/null +++ b/ggml/include/ggml-cann.h @@ -0,0 +1,46 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#define GGML_CANN_NAME "CANN" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_CANN_MAX_DEVICES 16 + +#define QK4_0 32 +typedef struct { + uint16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; + + +#define QK8_0 32 +typedef struct { + uint16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); + +GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); + +// device buffer +GGML_API GGML_CALL ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device); + +GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); +GGML_API GGML_CALL void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size); +GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); +void ggml_cann_backend_init(void); +void ggml_cann_backend_free(void); +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d895c9acdb596..2e534f3f87110 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -747,6 +747,8 @@ extern "C" { GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + 
GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); @@ -2391,6 +2393,7 @@ extern "C" { GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); + GGML_API int ggml_cpu_has_cann (void); // // Internal types and functions exposed for tests and benchmarks diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index d0f4097d8cd0c..cd226835caa89 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -737,6 +737,74 @@ if (GGML_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() +if (LLAMA_CANN) + if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME}) + set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME}) + message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") + endif() + + if (CANN_INSTALL_DIR) + # Only Support Linux. + if (LLAMA_CANN) + if (NOT UNIX) + set(LLAMA_CANN OFF) + message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off LLAMA_CANN") + endif() + endif() + + # Supported platforms: x86-64, arm64 + if (LLAMA_CANN) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + set(LLAMA_CANN OFF) + message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off LLAMA_CANN") + endif() + endif() + + # Set header and libs + if(LLAMA_CANN) + set(CANN_INCLUDE_DIRS + ${CANN_INSTALL_DIR}/include + ${CANN_INSTALL_DIR}/include/aclnn + ${CANN_INSTALL_DIR}/acllib/include + ) + + # TODO: find libs + link_directories( + ${CANN_INSTALL_DIR}/lib64 + ) + + add_subdirectory(ggml-cann/kernels) + list(APPEND CANN_LIBRARIES + ascendcl + nnopbase + opapi + acl_op_compiler + ascendc_kernels + ) + + set(GGML_HEADERS_CANN "../include/ggml-cann.h") + file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp") + list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp") + + message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") + message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) + endif() + else() + set(LLAMA_CANN OFF) + message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. 
Turning off LLAMA_CANN") + endif() + + if(NOT LLAMA_CANN) + message(WARNING "CANN: LLAMA_CANN is turned OFF, see above for details.") + endif() +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -1151,6 +1219,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN} ) if (EMSCRIPTEN) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 13c71c310c446..fb72060a8ac74 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -445,6 +445,11 @@ GGML_CALL static void ggml_backend_registry_init(void) { extern GGML_CALL void ggml_backend_kompute_reg_devices(void); ggml_backend_kompute_reg_devices(); #endif + +#ifdef GGML_USE_CANN + extern GGML_CALL int ggml_backend_cann_reg_devices(void); + ggml_backend_cann_reg_devices(); +#endif } GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp new file mode 100644 index 0000000000000..be9bfa51dccb2 --- /dev/null +++ b/ggml/src/ggml-cann.cpp @@ -0,0 +1,1128 @@ +#include "ggml-cann.h" + +#include + +#include +#include +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-cann/acl_ops.h" +#include "ggml-cann/aclnn_ops.h" +#include "ggml-cann/common.h" + +[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, + const char* file, int line, const char* msg) { + int32_t id = -1; + aclrtGetDevice(&id); + + fprintf(stderr, "CANN error: %s\n", msg); + fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, + file, line); + fprintf(stderr, " %s\n", stmt); + // abort with GGML_ASSERT to get a stack trace + GGML_ASSERT(!"CANN error"); +} + +void ggml_cann_set_device(const int32_t device) { + // TODO: uncomment these lines after empty context has fixed. + // int current_device; + // ACL_CHECK(aclrtGetDevice(¤t_device)); + + // if (device == current_device) { + // return; + // } + ACL_CHECK(aclrtSetDevice(device)); +} + +int32_t ggml_cann_get_device() { + int32_t id; + ACL_CHECK(aclrtGetDevice(&id)); + return id; +} + +static ggml_cann_device_info ggml_cann_init() { + ggml_cann_device_info info = {}; + + aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count); + + if (err != ACL_SUCCESS) { + fprintf(stderr, "%s: failed to initialize " GGML_CANN_NAME ": %s\n", + __func__, aclGetRecentErrMsg()); + return info; + } + + GGML_ASSERT(info.device_count <= GGML_CANN_MAX_DEVICES); + + // TODO: add more device info later. 
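// NOTE (illustrative sketch, not part of this patch): the public API declared in
// ggml-cann.h above is enough for an application to enumerate Ascend devices
// before calling ggml_backend_cann_init(). A minimal, hedged example; the helper
// name list_cann_devices is ours and error handling is omitted.
#include <stdio.h>
#include "ggml-cann.h"

static void list_cann_devices(void) {
    int32_t n_devices = ggml_backend_cann_get_device_count();
    for (int32_t i = 0; i < n_devices; i++) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cann_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cann_get_device_memory(i, &free_mem, &total_mem);
        // print the SoC name and free/total device memory in MiB for each visible NPU
        printf("CANN%d: %s, %zu/%zu MiB free\n", i, desc,
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }
}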
+ return info; +} + +const ggml_cann_device_info& ggml_cann_info() { + static ggml_cann_device_info info = ggml_cann_init(); + return info; +} + +// cann buffer +struct ggml_backend_cann_buffer_context { + int32_t device; + void* dev_ptr = nullptr; + std::string name; + std::vector dev_extra_ptrs; + + ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) + : device(device), + dev_ptr(dev_ptr), + name(GGML_CANN_NAME + std::to_string(device)) {} + + void* get_extra_ptr(size_t size) { + void* buffer; + ACL_CHECK(aclrtMalloc(&buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); + dev_extra_ptrs.push_back(buffer); + return buffer; + } + + ~ggml_backend_cann_buffer_context() { + ACL_CHECK(aclrtFree(dev_ptr)); + for (auto dev_extra_ptr : dev_extra_ptrs) { + ACL_CHECK(aclrtFree(dev_extra_ptr)); + } + } +}; + +GGML_CALL static const char* ggml_backend_cann_buffer_get_name( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + return ctx->name.c_str(); +} + +GGML_CALL static bool ggml_backend_buffer_is_cann( + ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cann_buffer_get_name; +} + +GGML_CALL static void ggml_backend_cann_buffer_free_buffer( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + delete ctx; +} + +GGML_CALL static void* ggml_backend_cann_buffer_get_base( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + return ctx->dev_ptr; +} + +GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, + const void* src, + void* dst) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q4_0* group = (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0)); + *scale_offset = group->d; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] & 0x0F); + (*quant_offset) |= ((group->qs[j + 1] << 4)); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] >> 4); + (*quant_offset) |= (group->qs[j + 1] & 0xF0); + quant_offset++; + } + } + + // put (uint4b_t -8) into int4b_t + for (quant_offset = (uint8_t*)dst; + quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } +} + +GGML_CALL static void ggml_backend_cann_transform_back_q4_0( + const ggml_tensor* tensor, void* src, void* dst) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)src; + uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes); + + for (;quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } + quant_offset = (uint8_t*)src; + + for (int i = 0; i < groups; i++) { + block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0)); + group->d = *scale_offset; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] = ((*quant_offset) & 0x0F); + group->qs[j + 1] = ((*quant_offset) >> 
4); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] |= ((*quant_offset) << 4); + group->qs[j + 1] |= ((*quant_offset) & 0xF0); + quant_offset++; + } + } +} + +GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, + const void* src, + void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q8_0* group = (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0)); + *scale_offset = group->d; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(quant_offset, group->qs, group_quant_size); + quant_offset += group_quant_size; + } +} + +GGML_CALL static void ggml_backend_cann_transform_back_q8_0( + const ggml_tensor* tensor, const void* src, void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + const uint8_t* quant_offset = (const uint8_t*)src; + const uint16_t* scale_offset = (const uint16_t*)((const char*)src + quant_bytes); + + for (int i = 0; i < groups; i++) { + block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0)); + group->d = *scale_offset; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(group->qs, quant_offset, group_quant_size); + quant_offset += group_quant_size; + } +} + +GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, + const void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +GGML_CALL static void ggml_backend_cann_transform_back( + const ggml_tensor* tensor, void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_back_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_back_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +GGML_CALL static bool need_transform(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } +} + +static void set_tensor_extra(ggml_backend_buffer_t buffer, + ggml_tensor* tensor) { + // if tensor is need transform, make sure all meta data are copied to + // npu. + // TODO: All tensors should copy meta data to npu, but extra is used to + // record memory usage. Only used for perf test. + size_t tensor_meta_size = sizeof(ggml_tensor); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + tensor->extra = ctx->get_extra_ptr(tensor_meta_size); + ACL_CHECK(aclrtMemcpy(tensor->extra, tensor_meta_size, tensor, + tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE)); +} + +GGML_CALL static void ggml_backend_cann_buffer_init_tensor( + ggml_backend_buffer_t buffer, ggml_tensor* tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + set_tensor_extra(buffer, tensor); + return; + } + + // TODO: can backend doesn't support quantized yet. Just leave the code + // here. 
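// NOTE (illustrative sketch, not part of this patch): ggml_backend_cann_transform_q8_0()
// above repacks host block_q8_0 data into the flat device layout the CANN kernels
// expect: all int8 quants first, then one fp16 scale per group of QK8_0 values.
// A host-only repack that mirrors that layout, reusing block_q8_0/QK8_0 from
// ggml-cann.h; the function name is ours.
#include <string.h>
#include "ggml-cann.h"

static void repack_q8_0_for_device(const block_q8_0 * src, int64_t n_groups, void * dst) {
    int8_t   * qs_out = (int8_t *) dst;                           // quant region
    uint16_t * d_out  = (uint16_t *) (qs_out + n_groups * QK8_0); // scale region follows
    for (int64_t i = 0; i < n_groups; i++) {
        memcpy(qs_out + i * QK8_0, src[i].qs, QK8_0);
        d_out[i] = src[i].d;
    }
}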
+ if (ggml_is_quantized(tensor->type)) { + // Initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t padded_size = + ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + size_t memset_size = padded_size - original_size; + ACL_CHECK(aclrtMemset((char*)tensor->data + original_size, + memset_size, 0, memset_size)); + } + } + set_tensor_extra(buffer, tensor); +} + +// TODO: need handle tensor which pas paddings. +GGML_CALL static void ggml_backend_cann_buffer_set_tensor( + ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data, + size_t offset, size_t size) { + GGML_ASSERT(size == ggml_nbytes(tensor)); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + // TODO: refer to cann(#6017), it use thread's default stream. + // For acl, synchronous functions use this default stream. + // Why aclrtSynchronizeDevice? + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset, size, + ACL_MEMCPY_HOST_TO_DEVICE)); + } else { + void* transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, (const char*)data + offset, + transform_buffer); + +#ifndef NDEBUG + void* check_buffer = malloc(size); + ggml_backend_cann_transform_back(tensor, transform_buffer, + check_buffer); + GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) == 0); + free(check_buffer); +#endif + ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE)); + free(transform_buffer); + } +} + +GGML_CALL static void ggml_backend_cann_buffer_get_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, + size_t offset, size_t size) { + GGML_ASSERT(size == ggml_nbytes(tensor)); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + } else { + void* transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + ggml_backend_cann_transform_back(tensor, transform_buffer, + (char*)data + offset); + free(transform_buffer); + } +} + +GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) { + if (ggml_backend_buffer_is_cann(src->buffer)) { + ggml_backend_cann_buffer_context* src_ctx = + (ggml_backend_cann_buffer_context*)src->buffer->context; + ggml_backend_cann_buffer_context* dst_ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + size_t memcpy_size = ggml_nbytes(src); + // Same device. + if (src_ctx->device == dst_ctx->device) { + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } else { + // Different device but can access by peer. 
+ int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, + dst_ctx->device)); + if (canAccessPeer) { + ggml_cann_set_device(src_ctx->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0)); + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } + } + } + return false; +} + +GGML_CALL static void ggml_backend_cann_buffer_clear( + ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); +} + +static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { + /* .get_name = */ ggml_backend_cann_buffer_get_name, + /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, + /* .get_base = */ ggml_backend_cann_buffer_get_base, + /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cann_buffer_clear, + /* .reset = */ NULL, +}; + +// cann buffer type +struct ggml_backend_cann_buffer_type_context { + int32_t device; + std::string name; +}; + +GGML_CALL static const char* ggml_backend_cann_buffer_type_name( + ggml_backend_buffer_type_t buft) { + ggml_backend_cann_buffer_type_context* ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + return ctx->name.c_str(); +} + +GGML_CALL static ggml_backend_buffer_t +ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + ggml_cann_set_device(buft_ctx->device); + + size = std::max(size, (size_t)1); + + void* dev_ptr; + aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + if (err != ACL_SUCCESS) { + fprintf( + stderr, + "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", + __func__, size / 1024.0 / 1024.0, buft_ctx->device, + aclGetRecentErrMsg()); + return nullptr; + } + + ggml_backend_cann_buffer_context* ctx = + new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); + + return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, + ctx, size); +} + +GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment( + ggml_backend_buffer_type_t buft) { + return 128; + + GGML_UNUSED(buft); +} + +GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size( + ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { + size_t size = ggml_nbytes(tensor); + int64_t ne0 = tensor->ne[0]; + + // last line must bigger than 32, because every single op deal at + // least 32 bytes. + // TODO: quantized type? + // int64_t line_size = ne0 * ggml_element_size(tensor); + // int64_t line_size_align_32 = (line_size + 31) & ~31; + // size += (line_size_align_32 - line_size); + + // TODO: not support quantized yet. + // TODO: consider un-continue tensor. 
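// NOTE (annotation, not part of this patch): the padding added just below mirrors what
// the CUDA backend does in its get_alloc_size. Assuming MATRIX_ROW_PADDING is 512, as
// elsewhere in ggml: a Q8_0 tensor with ne0 = 2112 gives 2112 % 512 = 64, so
// 512 - 64 = 448 extra elements are reserved, i.e. ggml_row_size(GGML_TYPE_Q8_0, 448)
// = 448 / 32 * 34 = 476 extra bytes, so kernels that read whole padded rows never run
// past the end of the allocation.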
+ if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size( + tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { + /* .get_name = */ ggml_backend_cann_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +GGML_CALL ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (device >= ggml_backend_cann_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type + ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES]; + + static bool ggml_backend_cann_buffer_type_initialized = false; + + if (!ggml_backend_cann_buffer_type_initialized) { + for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { + ggml_backend_cann_buffer_types[i] = { + /* .iface = */ ggml_backend_cann_buffer_type_interface, + /* .context = */ + new ggml_backend_cann_buffer_type_context{ + i, GGML_CANN_NAME + std::to_string(i)}, + }; + } + ggml_backend_cann_buffer_type_initialized = true; + } + + return &ggml_backend_cann_buffer_types[device]; +} + +static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, + struct ggml_tensor* dst) { + switch (dst->op) { + case GGML_OP_REPEAT: + ggml_cann_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggml_cann_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_ADD: + ggml_cann_add(ctx, dst); + break; + case GGML_OP_ACC: + ggml_cann_acc(ctx, dst); + break; + case GGML_OP_MUL: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_DIV: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_SILU: + ggml_cann_activation( + ctx, dst); + break; + // TODO: Use faster gelu?? 
+ case GGML_UNARY_OP_GELU_QUICK: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_TANH: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_RELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_HARDSIGMOID: + ggml_cann_activation(ctx, dst); + break; + case GGML_UNARY_OP_HARDSWISH: + ggml_cann_activation(ctx, dst); + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggml_cann_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggml_cann_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggml_cann_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggml_cann_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggml_cann_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggml_cann_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_cann_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggml_cann_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggml_cann_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggml_cann_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggml_cann_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggml_cann_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggml_cann_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggml_cann_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + // Do nothing with these ops. + break; + case GGML_OP_DIAG_MASK_INF: + ggml_cann_diag_mask(ctx, dst, -INFINITY); + break; + case GGML_OP_SOFT_MAX: + ggml_cann_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggml_cann_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggml_cann_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggml_cann_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggml_cann_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggml_cann_argsort(ctx, dst); + break; + default: + return false; + } + + return true; +} + +// backend +GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + return cann_ctx->name.c_str(); +} + +GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ACL_CHECK(aclrtSynchronizeDevice()); + cann_ctx->free_device_buffers(); + ACL_CHECK(aclrtResetDevice(cann_ctx->device)); + delete cann_ctx; + delete backend; +} + +GGML_CALL static ggml_backend_buffer_type_t +ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + return ggml_backend_cann_buffer_type(cann_ctx->device); +} + +GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, + ggml_tensor* tensor, + const void* data, + size_t offset, + size_t size) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, (const char*)data + offset, + size, ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); + } else { + void* transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, (const char*)data + offset, + transform_buffer); + +#ifndef NDEBUG + void* check_buffer = malloc(size); + 
ggml_backend_cann_transform_back(tensor, transform_buffer, + check_buffer); + GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size)); + free(check_buffer); +#endif + ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + free(transform_buffer); + } +} + +GGML_CALL static void ggml_backend_cann_get_tensor_async( + ggml_backend_t backend, const ggml_tensor* tensor, void* data, + size_t offset, size_t size) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ggml_backend_buffer_t buf = + tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + + GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && + "unsupported buffer type"); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data, + size, ACL_MEMCPY_DEVICE_TO_HOST, + cann_ctx->stream())); + } else { + void* transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST, + cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + ggml_backend_cann_transform_back(tensor, transform_buffer, + (char*)data + offset); + free(transform_buffer); + } +} + +GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( + ggml_backend_t backend_src, ggml_backend_t backend_dst, + const ggml_tensor* src, ggml_tensor* dst) { + GGML_ASSERT(ggml_backend_is_cann(backend_src) || + ggml_backend_is_cann(backend_dst)); + + if (!ggml_backend_buffer_is_cann(src->buffer) || + !ggml_backend_buffer_is_cann(dst->buffer)) { + return false; + } + + ggml_backend_buffer_t buf_src = + src->view_src ? src->view_src->buffer : src->buffer; + ggml_backend_buffer_t buf_dst = + dst->view_src ? 
dst->view_src->buffer : dst->buffer; + + ggml_backend_cann_context* cann_ctx_src = + (ggml_backend_cann_context*)backend_src->context; + ggml_backend_cann_context* cann_ctx_dst = + (ggml_backend_cann_context*)backend_dst->context; + + size_t copy_size = ggml_nbytes(dst); + if (backend_src != backend_dst) { + ggml_backend_cann_buffer_context* buf_ctx_src = + (ggml_backend_cann_buffer_context*)buf_src->context; + ggml_backend_cann_buffer_context* buf_ctx_dst = + (ggml_backend_cann_buffer_context*)buf_dst->context; + + GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device); + GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device); + + int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, + cann_ctx_dst->device)); + if (!canAccessPeer) { + return false; + } + + ggml_cann_set_device(cann_ctx_src->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0)); + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_dst->stream())); + + // record event on src stream + if (!cann_ctx_src->copy_event) { + ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event)); + } + + ACL_CHECK( + aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream())); + + // wait on dst stream for the copy to complete + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), + cann_ctx_src->copy_event)); + } else { + // src and dst are on the same backend + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_dst->stream())); + } + + return true; +} + +GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + + // Free temp buffers binding to stream. + cann_ctx->free_stream_buffers(0); +} + +GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( + ggml_backend_t backend, ggml_cgraph* cgraph) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor* node = cgraph->nodes[i]; + + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || + node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + + // if tensor is reused, free temp buffers first. + cann_ctx->free_tensor_buffers(node); + bool ok = ggml_cann_compute_forward(*cann_ctx, node); + + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, + node->name, ggml_op_name(node->op)); + } + // if not synchronize, aclrtSynchronizeStream in + // ggml_backend_cann_synchronize() will raise error. 
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + GGML_ASSERT(ok); + } + + return GGML_STATUS_SUCCESS; +} + +GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, + const ggml_tensor* op) { + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + case GGML_OP_MUL_MAT: { + switch (op->src[0]->type) { + // case GGML_TYPE_Q4_0: + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + case GGML_OP_MUL_MAT_ID: + return false; + // embedding + case GGML_OP_GET_ROWS: { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; + case GGML_OP_CPY: { + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + case GGML_OP_DUP: + case GGML_OP_REPEAT: + case GGML_OP_CONCAT: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cann_buffer_type_name; +} + +GGML_CALL static bool ggml_backend_cann_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cann(buft)) { + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *)backend->context; + ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *)buft->context; + return buft_ctx->device == cann_ctx->device; + } + + return false; +} + +GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend, + const ggml_tensor* op) { + const int min_batch_size = 32; + GGML_UNUSED(backend); + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; +} + +static ggml_backend_event_t ggml_backend_cann_event_new( + ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + aclrtEvent event; + ACL_CHECK(aclrtCreateEvent(&event)); + + return new ggml_backend_event{ + /* .backend = */ backend, + /* .context = */ event, + }; +} + +static void ggml_backend_cann_event_free(ggml_backend_event_t event) { + ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context)); + + delete event; +} + +static void ggml_backend_cann_event_record(ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)event->backend->context; + + ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, 
cann_ctx->stream())); +} + +static void ggml_backend_cann_event_wait(ggml_backend_t backend, + ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + if (ggml_backend_is_cann(event->backend)) { + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), + (aclrtEvent)event->context)); + } else { + GGML_ASSERT(false); + } +} + +static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) { + ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context)); +} + +static ggml_backend_i ggml_backend_cann_interface = { + /* .get_name = */ ggml_backend_cann_name, + /* .free = */ ggml_backend_cann_free, + /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async, + /* .synchronize = */ ggml_backend_cann_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_cann_graph_compute, + /* .supports_op = */ ggml_backend_cann_supports_op, + /* .supports_buft = */ ggml_backend_cann_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ ggml_backend_cann_event_new, + /* .event_free = */ ggml_backend_cann_event_free, + /* .event_record = */ ggml_backend_cann_event_record, + /* .event_wait = */ ggml_backend_cann_event_wait, + /* .event_synchronize = */ ggml_backend_cann_event_synchronize, +}; + +static ggml_guid_t ggml_backend_cann_guid() { + static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34, + 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64}; + return &guid; +} + +GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { + if (device < 0 || device >= ggml_backend_cann_get_device_count()) { + fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); + return nullptr; + } + + ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); + if (ctx == nullptr) { + fprintf(stderr, "%s: error: failed to allocate context\n", __func__); + return nullptr; + } + + ggml_backend_t cann_backend = + new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), + /* .interface = */ ggml_backend_cann_interface, + /* .context = */ ctx}; + + return cann_backend; +} + +GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) { + return backend != NULL && + ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); +} + +GGML_CALL int32_t ggml_backend_cann_get_device_count() { + return ggml_cann_info().device_count; +} + +GGML_CALL void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size) { + ggml_cann_set_device(device); + const char* soc_name = aclrtGetSocName(); + snprintf(description, description_size, "%s", soc_name); +} + +GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, + size_t* total) { + ggml_cann_set_device(device); + ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); +} + +// backend registry +GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params, + void* user_data) { + ggml_backend_t cann_backend = + ggml_backend_cann_init((int)(intptr_t)user_data); + return cann_backend; + + GGML_UNUSED(params); +} + +extern "C" GGML_CALL int ggml_backend_cann_reg_devices(); + +GGML_CALL int ggml_backend_cann_reg_devices() { + aclInit(nullptr); + uint32_t 
device_count = ggml_backend_cann_get_device_count(); + // initialization + for (uint32_t i = 0; i < device_count; i++) { + char name[128]; + snprintf(name, sizeof(name), "%s%d", GGML_CANN_NAME, i); + ggml_backend_register(name, ggml_backend_reg_cann_init, + ggml_backend_cann_buffer_type(i), + (void*)(intptr_t)i); + } + return device_count; +} + +void ggml_cann_backend_init(void) { ACL_CHECK(aclInit(nullptr)); } + +void ggml_cann_backend_free(void) { ACL_CHECK(aclFinalize()); } diff --git a/ggml/src/ggml-cann/acl_ops.cpp b/ggml/src/ggml-cann/acl_ops.cpp new file mode 100644 index 0000000000000..fac9ea1ae4c3d --- /dev/null +++ b/ggml/src/ggml-cann/acl_ops.cpp @@ -0,0 +1,80 @@ +#include "acl_ops.h" + +OpCaller::OpCaller() { attrs = aclopCreateAttr(); } + +OpCaller::~OpCaller() { + for (aclTensorDesc* desc : input_descs) { + aclDestroyTensorDesc(desc); + } + for (aclDataBuffer* buffer : input_buffers) { + aclDestroyDataBuffer(buffer); + } + for (aclTensorDesc* desc : output_descs) { + aclDestroyTensorDesc(desc); + } + for (aclDataBuffer* buffer : output_buffers) { + aclDestroyDataBuffer(buffer); + } + aclopDestroyAttr(attrs); +} + +OpCaller& OpCaller::name(std::string _op_name) { + op_name = _op_name; + return *this; +} + +OpCaller& OpCaller::input_no_contiguous(ggml_tensor* tensor, const char* name) { + aclDataType dtype = type_mapping(tensor->type); + // TODO + int64_t ne[] = {tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]}; + aclTensorDesc* tensor_desc = + aclCreateTensorDesc(dtype, GGML_MAX_DIMS, ne, ACL_FORMAT_ND); + aclSetTensorDescName(tensor_desc, name); + input_descs.push_back(tensor_desc); + aclDataBuffer* data_buffer = + aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor)); + input_buffers.push_back(data_buffer); + return *this; +} + +OpCaller& OpCaller::input(ggml_tensor* tensor, const char* name) { + GGML_ASSERT(ggml_is_contiguous(tensor)); + return input_no_contiguous(tensor, name); +} + +OpCaller& OpCaller::output(ggml_tensor* tensor, const char* name) { + aclDataType dtype = type_mapping(tensor->type); + aclTensorDesc* tensor_desc = + aclCreateTensorDesc(dtype, GGML_MAX_DIMS, tensor->ne, ACL_FORMAT_ND); + aclSetTensorDescName(tensor_desc, name); + output_descs.push_back(tensor_desc); + aclDataBuffer* data_buffer = + aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor)); + output_buffers.push_back(data_buffer); + return *this; +} + +OpCaller& OpCaller::attr(int64_t value, const char* name) { + ACL_CHECK(aclopSetAttrInt(attrs, name, value)); + return *this; +} + +OpCaller& OpCaller::attr(bool value, const char* name) { + ACL_CHECK(aclopSetAttrBool(attrs, name, value)); + return *this; +} + +OpCaller& OpCaller::attr(float value, const char* name) { + ACL_CHECK(aclopSetAttrFloat(attrs, name, value)); + return *this; +} + +OpCaller& OpCaller::run(aclrtStream stream) { + ACL_CHECK(aclSetCompileopt(ACL_OP_JIT_COMPILE, "disable")); + ACL_CHECK(aclopCompileAndExecute( + op_name.c_str(), input_descs.size(), input_descs.data(), + input_buffers.data(), output_buffers.size(), output_descs.data(), + output_buffers.data(), attrs, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr, + stream)); + return *this; +} diff --git a/ggml/src/ggml-cann/acl_ops.h b/ggml/src/ggml-cann/acl_ops.h new file mode 100644 index 0000000000000..740105c873afe --- /dev/null +++ b/ggml/src/ggml-cann/acl_ops.h @@ -0,0 +1,72 @@ +#ifndef CANN_ACL_OPS +#define CANN_ACL_OPS + +#include +#include + +#include +#include + +#include "acl_tensor.h" +#include "common.h" + +struct OpCaller { + std::string op_name; + 
std::vector input_descs; + std::vector input_buffers; + std::vector output_descs; + std::vector output_buffers; + aclopAttr* attrs; + std::vector ptrs; + + OpCaller(); + + virtual ~OpCaller(); + + OpCaller& name(std::string _op_name); + + OpCaller& input_no_contiguous(ggml_tensor* tensor, const char* name); + + OpCaller& input(ggml_tensor* tensor, const char* name); + + OpCaller& output(ggml_tensor* tensor, const char* name); + + OpCaller& attr(int64_t value, const char* name); + + OpCaller& attr(bool value, const char* name); + + OpCaller& attr(float value, const char* name); + + template + OpCaller& input(ggml_backend_cann_context& ctx, ggml_tensor *dst, T* values, + aclDataType dtype, size_t dims, int64_t* dim, + const char* name, aclrtStream stream = nullptr) { + size_t n_elem = 1; + for (size_t i = 0; i < dims; i++) { + n_elem *= dim[i]; + } + + size_t n_bytes = n_elem * sizeof(T); + void* device_ptr = ctx.alloc_buffer(dst, n_bytes); + if (stream == nullptr) { + ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + } else { + ACL_CHECK(aclrtMemcpyAsync(device_ptr, n_bytes, values, n_bytes, + ACL_MEMCPY_HOST_TO_DEVICE, stream)); + } + + aclTensorDesc* tensor_desc = + aclCreateTensorDesc(dtype, dims, dim, ACL_FORMAT_ND); + aclSetTensorDescName(tensor_desc, name); + input_descs.push_back(tensor_desc); + aclDataBuffer* data_buffer = aclCreateDataBuffer(device_ptr, n_bytes); + input_buffers.push_back(data_buffer); + + return *this; + } + + OpCaller& run(aclrtStream stream = nullptr); +}; + +#endif // CANN_ACL_OPS \ No newline at end of file diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp new file mode 100644 index 0000000000000..1b9cc5b351efc --- /dev/null +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -0,0 +1,204 @@ +#include "acl_tensor.h" + +#include +#include + +/** + * Mapping ggml_tensor type to acl_tensor type. + */ +aclDataType type_mapping(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return ACL_FLOAT; + case GGML_TYPE_F16: + return ACL_FLOAT16; + case GGML_TYPE_I8: + return ACL_INT8; + case GGML_TYPE_I16: + return ACL_INT16; + case GGML_TYPE_I32: + return ACL_INT32; + default: + return ACL_DT_UNDEFINED; + } + return ACL_DT_UNDEFINED; +} + +static bool nb3_is_valid(const ggml_tensor* tensor) { + // check tensor->nb[3] is contiguous by ne. + if (tensor->nb[3] == tensor->ne[0] * tensor->ne[1] * tensor->ne[2] + * ggml_element_size(tensor)) { + return true; + } + else { + return false; + } +} + +/** + * Transform ggml_tensor to acl_tensor. Note that ggml_tensor dimension order + * is reversed compared to acl_tensor. + * + * If bcast_ne and bcast_nb is nullptr, use ggml_tensor's ne and nb. + * otherwise, use bcast_ne bcast_nb, which means tensor dims should be + * changed to satisfy the broadcast. @sa: get_bcast_shape. + */ +aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne, + size_t* bcast_nb, int64_t bcast_dims, + aclFormat format, size_t offset) { + // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be + // added. + int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2]; + int64_t acl_storage_ne[GGML_MAX_DIMS * 2]; + if (bcast_ne == nullptr) { + for (int i = 0; i < GGML_MAX_DIMS; i++) { + acl_ne[i] = tensor->ne[i]; + // The step size of acl is in elements. 
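// NOTE (annotation, not part of this patch): a worked example of the nb -> stride
// conversion done below. A contiguous F32 tensor with ne = {8, 4, 2, 1} has byte
// strides nb = {4, 32, 128, 256}; dividing by the element size (4) gives element
// strides {1, 8, 32, 64}, and after the std::reverse() calls the ACL tensor is
// created with ne = {1, 2, 4, 8} and strides = {64, 32, 8, 1}, matching ACL's
// outermost-first dimension order.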
+ acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor); + acl_storage_ne[i] = acl_ne[i]; + } + if (!nb3_is_valid(tensor)) { + if (tensor->ne[GGML_MAX_DIMS-1] == 1) { + if (tensor->nb[2] == tensor->nb[0]*tensor->ne[0] && + tensor->nb[1] == tensor->nb[2]*tensor->ne[2]) { + // nb[3] not valid, tensor is not contiguous by permuted to + // (0,2,1,3), still use tensor->ne. + // @see https://github.com/ggerganov/llama.cpp/issues/7930. + for (int i = 0; i < GGML_MAX_DIMS; i++) { + acl_storage_ne[i] = acl_ne[i]; + } + } + else { + // nb[3] is valid but tensor is not contiguous. + // e.g. nb=(2,1024,121072,1048576), ne=(32,128,8,1) with + // fp16, 1024/2 not equal to 32. + // acl_storage_ne should be decided by tensor->nb. + for (int i = 0; i < GGML_MAX_DIMS-1; i++) { + acl_storage_ne[i] = std::max(static_cast(1), + acl_stride[i+1] / acl_stride[i]); + } + acl_storage_ne[GGML_MAX_DIMS-1] = + tensor->ne[GGML_MAX_DIMS-1]; + } + } + else { + // not impl + GGML_ASSERT(false); + } + } + } else { + // With bcast + for (int i = 0; i < bcast_dims; i++) { + acl_ne[i] = bcast_ne[i]; + acl_stride[i] = bcast_nb[i] / ggml_element_size(tensor); + acl_storage_ne[i] = acl_ne[i]; + } + } + + int64_t dims = (bcast_dims == 0 ? GGML_MAX_DIMS : bcast_dims); + std::reverse(acl_ne, acl_ne + dims); + std::reverse(acl_stride, acl_stride + dims); + std::reverse(acl_storage_ne, acl_storage_ne + dims); + + aclTensor* acl_tensor = aclCreateTensor( + acl_ne, dims, type_mapping(tensor->type), acl_stride, + offset / ggml_element_size(tensor), format, acl_storage_ne, dims, + tensor->data); + + return acl_tensor; +} + +aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, + size_t type_size, int64_t* ne, size_t* nb, + int64_t dims, aclFormat format, size_t offset) { + int64_t tmp_ne[GGML_MAX_DIMS * 2]; + int64_t tmp_stride[GGML_MAX_DIMS * 2]; + + memcpy(tmp_ne, ne, dims * sizeof(int64_t)); + for (int i = 0; i < dims; i++) { + tmp_stride[i] = nb[i] / type_size; + } + + std::reverse(tmp_ne, tmp_ne + dims); + std::reverse(tmp_stride, tmp_stride + dims); + + aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, + offset / type_size, format, tmp_ne, + dims, data_ptr); + + return acl_tensor; +} + +/** + * Add extra dims to satisfy acl kernel's broadcast rules (same as numpy). + * ggml_tensor dimension order is reversed compared to Python. + * bcast src1 with src0 though adding a extra dim. + * for example: + * src0 -> (32,10,10,10) + * src1 -> (16,10,10,10) + * bcast_ne_src0 -> (16,2,10,10,10) + * bcast_ne_src1 -> (16,1,10,10,10) + * + * if dim0 has padding. + * a -> (2, 2) padding = 2 + * a: [[1, 2, *, *] + * [2, 3, *, *]] + * nb = (8, 4, 2) + * + * if a should bcast with b -> (2, 4) + * b' -> (2, 2, 2) + * b : [[1, 2, 3, 4, *, *] + * [5, 6, 7, 8, *, *]] + * nb = (12, 6, 1) + * + * after bcast: + * a' -> (2, 1, 2) + * a': [[[1, 2], *, *] + * [[2, 3], *, *]] + * nb = (8, 4, 2, 1) + * + * b' : [[[1, 2], [3, 4], *, *] + * [[5, 6], [7, 8], *, *]] + * nb = (12, 6, 2, 1) + * + * because dim1 in a inserted dim, should add nb for dim1, + * and all other nb moves to next in order. 
+ */ +int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, + int64_t* bcast_ne_src0, int64_t* bcast_ne_src1, + size_t* bcast_nb_src0, size_t* bcast_nb_src1) { + GGML_ASSERT(ggml_can_repeat(src1, src0)); + int bcast_dim_cnt = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + int64_t nr = src0->ne[i] / src1->ne[i]; + bcast_ne_src0[bcast_dim_cnt] = src0->ne[i] / nr; + bcast_ne_src1[bcast_dim_cnt] = src1->ne[i]; + bcast_nb_src0[bcast_dim_cnt] = src0->nb[i]; + bcast_nb_src1[bcast_dim_cnt] = src1->nb[i]; + bcast_dim_cnt++; + if (nr != 1) { + // Need to add an extra dim. + bcast_ne_src0[bcast_dim_cnt] = nr; + bcast_ne_src1[bcast_dim_cnt] = 1; + bcast_nb_src0[bcast_dim_cnt] = bcast_nb_src0[bcast_dim_cnt - 1] * + bcast_ne_src0[bcast_dim_cnt - 1]; + bcast_nb_src1[bcast_dim_cnt] = bcast_nb_src1[bcast_dim_cnt - 1] * + bcast_ne_src1[bcast_dim_cnt - 1]; + bcast_dim_cnt++; + } + } + return bcast_dim_cnt; +} + +/** + * Check if shape are not same, and no dim equals 1. + * if any dim equals 1, acl kernel will do the broadcast. + */ +bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) { + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) { + return true; + } + } + return false; +} diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h new file mode 100644 index 0000000000000..9c24c726e7c24 --- /dev/null +++ b/ggml/src/ggml-cann/acl_tensor.h @@ -0,0 +1,38 @@ +#ifndef CANN_ACL_TENSOR_H +#define CANN_ACL_TENSOR_H + +#include + +#include "common.h" + +// Broadcast +aclDataType type_mapping(ggml_type type); + +aclTensor* create_acl_tensor(const ggml_tensor* tensor, + int64_t* bcast_ne = nullptr, + size_t* bcast_nb = nullptr, int64_t bcast_dims = 0, + aclFormat format = ACL_FORMAT_ND, size_t offset = 0); + +aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, + size_t type_size, int64_t* ne, size_t* nb, + int64_t dims, aclFormat format = ACL_FORMAT_ND, size_t offset = 0); + +bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1); + +int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, + int64_t* bcast_ne_src0, int64_t* bcast_ne_src1, + size_t* bcast_nb_src0, size_t* bcast_nb_src1); + +// Bcast macro to avoid duplicate code. 
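// NOTE (annotation, not part of this patch): a worked example for get_bcast_shape()
// above. With src0->ne = {32, 8, 1, 1} and src1->ne = {32, 1, 1, 1}, dim 1 needs a
// repeat factor of 8, so one extra dimension is inserted and the function returns
// bcast_dim_cnt = 5 with
//   bcast_ne_src0 = {32, 1, 8, 1, 1}
//   bcast_ne_src1 = {32, 1, 1, 1, 1}
// i.e. the repeated axis of src0 is split out while src1 keeps a size-1 axis there,
// letting the ACL kernels apply their numpy-style broadcast. The BCAST_SHAPE /
// BCAST_PARAM macros below only declare the scratch arrays and forward them;
// ggml_cann_add() in aclnn_ops.cpp uses them exactly this way.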
+#define BCAST_SHAPE(src0, src1) \ + int64_t bcast_ne_##src0[GGML_MAX_DIMS * 2]; \ + int64_t bcast_ne_##src1[GGML_MAX_DIMS * 2]; \ + size_t bcast_nb_##src0[GGML_MAX_DIMS * 2]; \ + size_t bcast_nb_##src1[GGML_MAX_DIMS * 2]; \ + int64_t bcast_dims = \ + get_bcast_shape(src0, src1, bcast_ne_##src0, bcast_ne_##src1, \ + bcast_nb_##src0, bcast_nb_##src1); + +#define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims + +#endif // CANN_ACL_TENSOR_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp new file mode 100644 index 0000000000000..51594633d5da4 --- /dev/null +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -0,0 +1,2516 @@ +#include "aclnn_ops.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "kernels/ascendc_kernels.h" + +static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* repeat_array, + ggml_tensor* bind_tensor) { + // repeat tensor along each dim with repeat_array + + aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnRepeat(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclDestroyIntArray(repeats)); +} + +void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(ggml_can_repeat(src, dst)); + + size_t nbytes = ggml_nbytes(dst); + aclrtStream main_stream = ctx.stream(); + // Set dst to a zero tensor. 
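// NOTE (illustrative sketch, not part of this patch): every aclnn-based op in this
// file follows the two-phase pattern shown in aclnn_repeat() above: query the
// workspace size, allocate it from the context's temporary pool, launch on the
// context stream, then destroy the ACL handles. Schematically, for a hypothetical
// operator "Foo" (aclnnFoo is a placeholder, not a real API):
//
//   uint64_t workspaceSize = 0;
//   aclOpExecutor* executor = nullptr;
//   void* workspaceAddr = nullptr;
//   ACL_CHECK(aclnnFooGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
//   if (workspaceSize > 0) {
//       workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);  // recycled once the stream syncs
//   }
//   ACL_CHECK(aclnnFoo(workspaceAddr, workspaceSize, executor, ctx.stream()));
//   ACL_CHECK(aclDestroyTensor(acl_src));
//   ACL_CHECK(aclDestroyTensor(acl_dst));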
+ ACL_CHECK(aclrtMemsetAsync(dst->data, nbytes, 0, nbytes, main_stream)); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], + dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]}; + + aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray, dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, + aclTensor* acl_src1, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + // add: dst = acl_src0 + alpha*acl_src1 + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(alpha)); +} + +void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + aclTensor* acl_src0; + aclTensor* acl_src1; + aclTensor* acl_dst; + + // Need bcast + if (!ggml_are_same_shape(src0, src1) && need_bcast(src0, src1)) { + BCAST_SHAPE(src0, src1) + acl_src0 = create_acl_tensor(src0, BCAST_PARAM(src0)); + acl_src1 = create_acl_tensor(src1, BCAST_PARAM(src1)); + acl_dst = create_acl_tensor(dst, BCAST_PARAM(src0)); + } else { + acl_src0 = create_acl_tensor(src0); + acl_src1 = create_acl_tensor(src1); + acl_dst = create_acl_tensor(dst); + } + + aclnn_add(ctx, acl_src0, acl_src1, acl_dst, dst); + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + aclScalar* acl_negative_slope = + aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnLeakyReluGetWorkspaceSize( + acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(acl_negative_slope)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_concat(ggml_backend_cann_context& ctx, aclTensorList* tensorList, + aclTensor* acl_dst, int64_t concat_dim, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + // dims in llama.cpp is reversed. 
+ ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream)); +} + +void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + aclTensor* acl_src0 = create_acl_tensor(src0); + aclTensor* acl_src1 = create_acl_tensor(src1); + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t concat_dim = 1; + aclTensor* tensors[] = {acl_src0, acl_src1}; + aclTensorList* tensorList = aclCreateTensorList(tensors, 2); + aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst); + + ACL_CHECK(aclDestroyTensorList(tensorList)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, + float start, float stop, float step, int64_t n_elements, + ggml_tensor* bind_tensor) { + // arange: [start, stop), out(i+1) = out(i) + step. + + int64_t steps = (int64_t)std::ceil((stop - start) / step); + GGML_ASSERT(n_elements == steps); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT); + aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT); + aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT); + + ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnArange(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(acl_start)); + ACL_CHECK(aclDestroyScalar(acl_end)); + ACL_CHECK(aclDestroyScalar(acl_step)); +} + +void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t n_elements = ggml_nelements(dst); + float start; + float stop; + float step; + memcpy(&start, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&stop, (float*)dst->op_params + 1, sizeof(float)); + memcpy(&step, (float*)dst->op_params + 2, sizeof(float)); + + aclnn_arange(ctx, acl_dst, start, stop, step, n_elements, dst); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + dst->src[1] = dst->src[0]; + ggml_cann_mul_div(ctx, dst); +} + +void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // y = max(min(x, max_value), min_value). 
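+ // GGML_OP_CLAMP packs {min, max} as two consecutive floats in op_params;
+ // they are handed to aclnnClamp as scalars below.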
+ + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float*)dst->op_params + 1, sizeof(float)); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT); + aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(acl_min)); + ACL_CHECK(aclDestroyScalar(acl_max)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // acl_dst = acl_src * scale. + + ggml_tensor* src = dst->src[0]; + + // scale factor + float v; + memcpy(&v, dst->op_params, sizeof(float)); + + aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT); + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(scale)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + void* buffer = ctx.alloc_buffer(dst, ggml_nelements(dst) * sizeof(int64_t)); + aclTensor* tmp_tensor = + create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, + dst->nb, GGML_MAX_DIMS); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnArgsortGetWorkspaceSize( + acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnArgsort(workspaceAddr, workspaceSize, executor, main_stream)); + + workspaceSize = 0; + ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type), + acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(tmp_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // layer_norm for one layer. 
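+ // GGML_OP_NORM normalizes over the innermost dimension (ne[0]) and has no
+ // affine parameters, so normalized_shape is {ne[0]} and the gamma/beta
+ // arguments of aclnnLayerNorm are passed as nullptr.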
+ + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + std::vector normData = {dst->ne[0]}; + aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size()); + ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, + eps, acl_dst, nullptr, nullptr, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnLayerNorm(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyIntArray(norm)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + const float eps = 1e-6f; // TODO: make this a parameter + int n_groups = dst->op_params[0]; + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + int64_t N = src->ne[3]; + int64_t C = src->ne[2]; + int64_t HxW = src->ne[1] * src->ne[0]; + + size_t type_size = ggml_type_size(src->type); + int64_t ne[] = {n_groups, N}; + size_t nb[] = {type_size, type_size * n_groups}; + size_t n_bytes = N * n_groups; + void* buffer = ctx.alloc_buffer(dst, n_bytes * 2); + aclTensor* acl_mean_out = + create_acl_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); + aclTensor* acl_rstd_out = create_acl_tensor( + (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); + + ACL_CHECK(aclnnGroupNormGetWorkspaceSize( + acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst, + acl_mean_out, acl_rstd_out, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnGroupNorm(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_mean_out)); + ACL_CHECK(aclDestroyTensor(acl_rstd_out)); +} + +void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // if inplace: dst = dst + alpha * src1 + // else: dst = src0 + alpha * src1 + + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + size_t nb1 = ((int32_t*)dst->op_params)[0]; + size_t nb2 = ((int32_t*)dst->op_params)[1]; + size_t nb3 = ((int32_t*)dst->op_params)[2]; + size_t offset = ((int32_t*)dst->op_params)[3]; + bool inplace = (bool)((int32_t*)dst->op_params)[4]; + + size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3}; + + aclTensor* acl_dst = create_acl_tensor( + dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); + aclTensor* acl_src1 = create_acl_tensor(src1); + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclrtStream stream = ctx.stream(); + + if (!inplace) { + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, stream)); + aclTensor* acl_src0 = create_acl_tensor( + src0, src1->ne, src0->nb, 
GGML_MAX_DIMS, ACL_FORMAT_ND, offset); + ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclDestroyTensor(acl_src0)); + } else { + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, stream)); + } + + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // reducesum along last dim. + + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + + GGML_ASSERT(dst->ne[0] == 1); + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t reduce_dims_host[] = {3}; + aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnReduceSumGetWorkspaceSize(acl_src, reduce_dims, true, + type_mapping(src->type), acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnReduceSum(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = + create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor* acl_dst = + create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + + std::vector output_size{dst->ne[1], dst->ne[0]}; + auto output_size_array = aclCreateIntArray(output_size.data(), 2); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize( + acl_src, output_size_array, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK( + aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyIntArray(output_size_array)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst, + aclTensor* acl_src, aclTensor* acl_dst, int64_t* paddings, + float value = 0.0f) { + aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2); + aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize( + acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK( + aclnnConstantPadNd(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyIntArray(acl_pad)); + ACL_CHECK(aclDestroyScalar(acl_value)); +} + +void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + aclTensor* acl_src = 
create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + // padding: value in the array means how much distance will be padding. + // the position of elements in the array means which dirction to padding, + // each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind, + // dim2.front, dim2.behind, dim3.front, dim3.behind] + int64_t paddings[] = { + 0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1], + 0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]}; + aclnn_pad(ctx, dst, acl_src, acl_dst, paddings); + + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_src)); +} + +void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + const int32_t* opts = (const int32_t*)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + switch (op) { + case GGML_OP_POOL_AVG: + ggml_cann_avg_pool2d(ctx, dst); + break; + case GGML_OP_POOL_MAX: + ggml_cann_max_pool2d(ctx, dst); + break; + case GGML_OP_POOL_COUNT: + GGML_ASSERT(false); + break; + } +} + +void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = + create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor* acl_dst = + create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + + // params + const int32_t* opts = (const int32_t*)dst->op_params; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + std::vector kernel_dims = {k1, k0}; + std::vector stride_dims = {s1, s0}; + std::vector padding_avg_dims = {p1, p0}; // (padH, padW) + + auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto* strides = aclCreateIntArray(stride_dims.data(), 2); + auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2); + + bool ceil_mode = false; // + bool count_include_pad = true; + int64_t divisor_override = 0; + int8_t cube_math_type = 0; + + // execute op api + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize( + acl_src, kernel_size, strides, paddings_avg, ceil_mode, + count_include_pad, divisor_override, cube_math_type, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK(aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, stream)); + + // release + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyIntArray(kernel_size)); + ACL_CHECK(aclDestroyIntArray(strides)); + ACL_CHECK(aclDestroyIntArray(paddings_avg)); +} + +void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = + create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor* acl_dst = + create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + // params + const int32_t* opts = (const int32_t*)dst->op_params; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], + src->ne[3]}; + size_t 
temp_nb[GGML_MAX_DIMS]; + + temp_nb[0] = ggml_element_size(src); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1]; + } + + void* buffer = + ctx.alloc_buffer(dst, ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); + aclTensor* tmp_tensor = + create_acl_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, + temp_nb, GGML_MAX_DIMS, ACL_FORMAT_NCHW); + + // pad: see padding in ggml_cann_pad() + int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0}; + float value = -FLT_MAX; + aclnn_pad(ctx, dst, acl_src, tmp_tensor, paddings, value); + + // max_pool + std::vector kernel_dims = {k1, k0}; + std::vector stride_dims = {s1, s0}; + // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end] + std::vector padding_max_dims = {0, 0, 0, 0}; + std::vector dilation_size = {1, 1}; + auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto* strides = aclCreateIntArray(stride_dims.data(), 2); + auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4); + auto* dilations = aclCreateIntArray(dilation_size.data(), 2); + + bool ceil_mode = false; + int64_t auto_pads = 0; + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnMaxPoolGetWorkspaceSize( + tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations, + ceil_mode, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnMaxPool(workspaceAddr, workspaceSize, executor, stream)); + + // release + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(tmp_tensor)); + ACL_CHECK(aclDestroyIntArray(kernel_size)); + ACL_CHECK(aclDestroyIntArray(strides)); + ACL_CHECK(aclDestroyIntArray(paddings_max)); + ACL_CHECK(aclDestroyIntArray(dilations)); +} + +static void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, + aclTensor* acl_src, aclTensor* acl_dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream)); +} + +void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + // param + dup_param param; + for (int i=0; i<4; i++) { + param.src_ne[i] = src->ne[i]; + param.src_nb[i] = src->nb[i]; + param.dst_ne[i] = dst->ne[i]; + param.dst_nb[i] = dst->nb[i]; + } + + // TODO: simplefify + if (src->type==GGML_TYPE_F16) { + if (dst->type==GGML_TYPE_Q8_0) { + aclrtlaunch_ascendc_quantize_f16_q8_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type==GGML_TYPE_F16) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + + // 
param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp16(rows_num, ctx.stream(), + src->data, dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + GGML_ASSERT(false); + } + if (dst->type==GGML_TYPE_F32) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(rows_num, + ctx.stream(), + src->data, + dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + GGML_ASSERT(false); + } + // TODO + GGML_ASSERT(false); + } + else if (src->type==GGML_TYPE_F32) { + //TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size + // && nb0 == type_size) + if (dst->type==GGML_TYPE_Q8_0) { + aclrtlaunch_ascendc_quantize_f32_q8_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type==GGML_TYPE_F32) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp32(rows_num, ctx.stream(), + src->data, dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + else { + //TODO: dst not contiguous + GGML_ASSERT(false); + } + } + if (dst->type==GGML_TYPE_F16) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(rows_num, + ctx.stream(), + src->data, + dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + } + // TODO + GGML_ASSERT(false); + } + else { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + 
ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + GGML_ASSERT(false); + } +} + +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x, + const aclTensor* gamma, double epsilon, + const aclTensor* yOut, + const aclTensor* rstdOout, + uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream); +#ifdef __cplusplus +} +#endif + +static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst, + int64_t* ne, int64_t dims, aclDataType type, + size_t type_size) { + int64_t elements = 1; + for (int i = 0; i < dims; i++) { + elements *= ne[i]; + } + size_t n_bytes = elements * type_size; + + size_t nb[GGML_MAX_DIMS]; + nb[0] = type_size; + for (int i = 1; i < dims; i++) { + nb[i] = nb[i - 1] * ne[i - 1]; + } + + void* buffer = ctx.alloc_buffer(dst, n_bytes); + ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream())); + aclTensor* zero = create_acl_tensor(buffer, type, type_size, ne, nb, dims); + return zero; +} + +static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst, + int64_t* ne, int64_t dims, aclDataType type, + size_t type_size, float value = 1.0f) { + aclTensor* acl_tensor = aclnn_zero(ctx, dst, ne, dims, type, type_size); + float alpha_host = 1.0f; + aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); + aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK( + aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream())); + + return acl_tensor; +} + +void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_ASSERT(eps > 0.0f); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclTensor* acl_gamma = aclnn_ones( + ctx, dst, src->ne, 1, type_mapping(src->type), ggml_element_size(src)); + + int64_t rstd_ne[] = {1, src->ne[1], src->ne[2], src->ne[3]}; + aclTensor* acl_rstd = + aclnn_zero(ctx, dst, rstd_ne, GGML_MAX_DIMS, type_mapping(src->type), + ggml_element_size(src)); + + ACL_CHECK(aclnnRmsNormGetWorkspaceSize( + acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK( + aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_gamma)); + ACL_CHECK(aclDestroyTensor(acl_rstd)); +} + +// TODO: performace is low. 
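+// The mask below is built from several full-tensor launches (fill + triu on
+// the mask, tril on src, then an in-place add); a single fused kernel would
+// likely be faster, hence the TODO above.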
+void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, + float value) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + const int n_past = ((int32_t*)dst->op_params)[0]; + + aclTensor* mask_tensor = + aclnn_ones(ctx, dst, src->ne, GGML_MAX_DIMS, type_mapping(src->type), + ggml_element_size(src), value); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK( + aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream())); + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(alpha)); + ACL_CHECK(aclDestroyTensor(mask_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, aclDataType cast_data_type, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream)); +} + +static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* new_dim, uint64_t dims, + ggml_tensor* bind_tensor) { + aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK( + aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyIntArray(acl_dims)); +} + +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self, + const aclIntArray* kernelSize, + const aclIntArray* dilation, + const aclIntArray* padding, + const aclIntArray* stride, + aclTensor* out, uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream); +#ifdef __cplusplus +} +#endif +void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // kernel + ggml_tensor* src1 = dst->src[1]; // input + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + 
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH] + aclTensor* acl_src1 = create_acl_tensor(src1); + int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N}; + size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; + + tmp_im2col_nb[0] = ggml_type_size(src1->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1]; + } + + // Calculate im2col. + // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * + // dst.elemcount. + void* tmp_im2col_buffer = + ctx.alloc_buffer(dst, ggml_nelements(dst) * ggml_element_size(src1)); + aclTensor* tmp_im2col_tensor = create_acl_tensor( + tmp_im2col_buffer, type_mapping(src1->type), ggml_type_size(src1->type), + tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + + std::vector kernel_dims = {KH, KW}; + std::vector dilation_size = {d1, d0}; + std::vector padding_dims = {p1, p0}; + std::vector stride_dims = {s1, s0}; + auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto* dilations = aclCreateIntArray(dilation_size.data(), 2); + auto* paddings = aclCreateIntArray(padding_dims.data(), 2); + auto* strides = aclCreateIntArray(stride_dims.data(), 2); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations, + paddings, strides, tmp_im2col_tensor, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream)); + + // Cast if dst is f16. 
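+ // The im2col result above was produced in src1's type (F32); when dst is
+ // F16 it is cast into a second temporary before the final permute.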
+ aclTensor* tmp_cast_tensor = nullptr; + if (src1->type != dst->type) { + void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst)); + size_t temp_cast_nb[GGML_MAX_DIMS - 1]; + temp_cast_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1]; + } + + tmp_cast_tensor = create_acl_tensor( + tmp_cast_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, + type_mapping(dst->type), dst); + } + + // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; + size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; + aclTensor* acl_dst = + create_acl_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3, dst); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3, dst); + } + + // release + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_cast_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyIntArray(kernel_size)); + ACL_CHECK(aclDestroyIntArray(dilations)); + ACL_CHECK(aclDestroyIntArray(paddings)); + ACL_CHECK(aclDestroyIntArray(strides)); +} + +static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK( + aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, + float scale, aclTensor* acl_dst, bool inplace, + ggml_tensor* bind_tensor) { + aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + if (inplace) { + ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } + else { + ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } + + + ACL_CHECK(aclDestroyScalar(acl_scale)); +} + +static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_other, ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK( + aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* 
acl_other, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + const ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + int half = dim / 2; + + aclTensor* acl_src = create_acl_tensor(src); + + // arange: [0, ..., half) + float start = 0; + float stop = half; + float step = 1; + int64_t n_elements_arange = half; + int64_t tmp_arange_ne[] = {half}; + size_t tmp_arange_nb[] = {sizeof(dst->type)}; + + void* tmp_arange_buffer = ctx.alloc_buffer(dst, half * sizeof(dst->type)); + aclTensor* tmp_arange_tensor = create_acl_tensor( + tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange, + dst); + + // freq + float freq_param = -logf(max_period) / half; + bool inplace = true; + aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace, dst); + aclnn_exp(ctx, tmp_arange_tensor, dst); + + // permute: src [0,1,2,3]->[0,1,3,2] + int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]}; + size_t tmp_permute_nb[GGML_MAX_DIMS]; + tmp_permute_nb[0] = ggml_type_size(src->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src)); + aclTensor* tmp_permute_tenosr = create_acl_tensor( + tmp_permute_buffer, type_mapping(src->type), ggml_type_size(src->type), + tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + int64_t permute_dim[] = {0, 1, 3, 2}; + int64_t num_dims = 4; + aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims, dst); + + // timestep * freq + int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2], + src->ne[3]}; + size_t tmp_mul_nb[GGML_MAX_DIMS]; + tmp_mul_nb[0] = ggml_type_size(src->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1]; + } 
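+ // The remaining steps compute args = timestep * freq, then concatenate
+ // cos(args) and sin(args) along ggml dim 0 (acl dim 3), which should match
+ // the cos-then-sin layout of the CPU timestep embedding.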
+ + int mul_nelements = + src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; + + void* tmp_mul_buffer = + ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + aclTensor* tmp_mul_tensor = create_acl_tensor( + tmp_mul_buffer, type_mapping(src->type), ggml_type_size(src->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_noinplcace_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, + tmp_mul_tensor, dst); + + // cos + void* tmp_cos_buffer = + ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + aclTensor* tmp_cos_tensor = create_acl_tensor( + tmp_cos_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor, dst); + + // sin + void* tmp_sin_buffer = + ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + aclTensor* tmp_sin_tensor = create_acl_tensor( + tmp_sin_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor, dst); + + // concat + int64_t concat_dim = 3; + aclTensor* acl_dst = create_acl_tensor(dst); + aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor}; + aclTensorList* tensorList = aclCreateTensorList(tensors, 2); + aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst); + + // release + // segmentation fault when delete both tensorList and his elements. + ACL_CHECK(aclDestroyTensorList(tensorList)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr)); + ACL_CHECK(aclDestroyTensor(tmp_mul_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + // fill acl_dst with scalar value. 
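+ // Uses aclnnInplaceFillScalar: acl_dst must already be backed by a device
+ // buffer, only its contents are overwritten.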
+ + auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize( + acl_dst, acl_scalar, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor, + ctx.stream())); + ACL_CHECK(aclDestroyScalar(acl_scalar)); +} + +static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, + aclTensor* acl_dst, aclTensor* acl_exp, + ggml_tensor* bind_tensor) { + // acl_dst = acl_dst^acl_exp + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize( + acl_dst, acl_exp, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize, + executor, ctx.stream())); +} + +static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_position, aclTensor* acl_dst, const int n_head, + int64_t* src_ne, const size_t src_nb0, float max_bias, + ggml_tensor* dst) { + GGML_UNUSED(src_ne[1]); + const int64_t ne2_ne3 = src_ne[2] * src_ne[3]; + GGML_ASSERT(src_nb0 == sizeof(float)); + GGML_ASSERT(n_head == src_ne[2]); + + const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head)); + + float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + // init arange + void* tmp_arange_buffer = + ctx.alloc_buffer(dst, ne2_ne3 * ggml_type_size(dst->type)); + size_t memset_size = ne2_ne3 * ggml_type_size(dst->type); + ACL_CHECK(aclrtMemset(tmp_arange_buffer, memset_size, 0, memset_size)); + + // arange1: [1, ..., n_heads_log2_floor+1) + float start = 1; + float stop = n_heads_log2_floor + 1; + float step = 1; + int64_t n_elements_arange = n_heads_log2_floor; + + int64_t tmp_arange1_ne[] = {n_heads_log2_floor}; + size_t tmp_arange1_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_arange1_tensor = create_acl_tensor( + tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_arange1_ne, tmp_arange1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange, + dst); + + aclTensor* tmp_arange2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1) + start = 1; + stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1; + step = 2; + n_elements_arange = ne2_ne3 - n_heads_log2_floor; + int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_arange2_nb[] = {sizeof(dst->type)}; + + aclTensor* tmp_arange2_tensor = create_acl_tensor( + (char*)tmp_arange_buffer + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), tmp_arange2_ne, + tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, + n_elements_arange, dst); + } + + // init mk_base + void* tmp_mk_base_buffer = + ctx.alloc_buffer(dst, ne2_ne3 * ggml_type_size(dst->type)); + int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; + size_t tmp_mk_base1_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base1_tensor = create_acl_tensor( + tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_base1_ne, tmp_mk_base1_nb, 
GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor, dst); + + aclTensor* tmp_mk_base2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_mk_base2_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base2_tensor = create_acl_tensor( + (char*)tmp_mk_base_buffer + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), tmp_mk_base2_ne, + tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor, dst); + } + + // init mk + int64_t tmp_mk_base_ne[] = {ne2_ne3}; + size_t tmp_mk_base_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base_tensor = create_acl_tensor( + tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclTensor* tmp_arange_tensor = create_acl_tensor( + tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor, dst); + + // reshape mk + int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; + size_t tmp_mk_nb[GGML_MAX_DIMS]; + tmp_mk_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1]; + } + aclTensor* tmp_mk_tensor = create_acl_tensor( + tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + // acl_position * mk + int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; + size_t tmp_output_nb[GGML_MAX_DIMS]; + tmp_output_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1]; + } + void* tmp_output_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst)); + aclTensor* tmp_output_tensor = create_acl_tensor( + tmp_output_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_noinplcace_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor, + dst); + + // add + aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst, dst); + + ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_output_tensor)); +} + +void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + const int n_head = ((int32_t*)dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t*)dst->op_params + 2, sizeof(float)); + + const int64_t ne0 = src->ne[0]; // all_seq_len = n_past + ne1 + const int64_t ne2 = src->ne[2]; // n_head -> this is k + const size_t nb0 = src->nb[0]; + + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(n_head == ne2); + + // position arange: [0, ..., ne0) + float start = 0; + float stop = ne0; + float step = 1; + int64_t n_elements_arange = ne0; + int64_t tmp_position_ne[] = {ne0, 1, 1, 1}; + size_t tmp_position_nb[] = {sizeof(dst->type)}; + + void* tmp_position_buffer = ctx.alloc_buffer(dst, ne0 * sizeof(dst->type)); + aclTensor* tmp_position_tensor = 
create_acl_tensor( + tmp_position_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_position_ne, tmp_position_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_position_tensor, start, stop, step, n_elements_arange, + dst); + + // call alibi + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + aclnn_alibi(ctx, acl_src, tmp_position_tensor, acl_dst, n_head, src->ne, nb0, max_bias, dst); + + ACL_CHECK(aclDestroyTensor(tmp_position_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_cann_dup(ctx, dst); +} + +static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(alpha)); +} + +static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, + int64_t dim, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream)); +} + +void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; // mask + + aclTensor* acl_src0 = create_acl_tensor(src0); + aclTensor* acl_dst = create_acl_tensor(dst); + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float)); + + // input mul scale + aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); + + size_t n_bytes = ggml_nbytes(src0); + void* input_mul_scale_buffer = ctx.alloc_buffer(dst, n_bytes); + aclTensor* acl_input_mul_scale_tensor = create_acl_tensor( + input_mul_scale_buffer, + ACL_FLOAT, + ggml_type_size(src0->type), + src0->ne, src0->nb, + GGML_MAX_DIMS); + + bool inplace = false; + aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace, + dst); + + // mask + aclTensor* acl_src1_fp32_tensor = nullptr; + aclTensor* tmp_permute_tenosr = nullptr; + if (src1) { + const bool use_f16 = src1->type == GGML_TYPE_F16; + if (use_f16) { + // cast to fp32 + size_t n_bytes = ggml_nelements(src1) * sizeof(float_t); + size_t src1_fp32_nb[GGML_MAX_DIMS]; + src1_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1]; + } + void* src1_fp32_buffer = ctx.alloc_buffer(dst, n_bytes); + acl_src1_fp32_tensor = create_acl_tensor(src1_fp32_buffer, + ACL_FLOAT, + sizeof(float), + src1->ne, + src1_fp32_nb, + GGML_MAX_DIMS); + aclTensor* acl_src1 = 
create_acl_tensor(src1); + aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT, dst); + + ACL_CHECK(aclDestroyTensor(acl_src1)); + + } + else { + acl_src1_fp32_tensor = create_acl_tensor(src1); + } + + // broadcast the mask across rows + if (src1->ne[1] != src0->ne[1]) { + // mask shape: [1,1,a,b] + int64_t tmp_permute_ne[] = {src1->ne[0], src1->ne[2], src1->ne[1], src1->ne[3]}; + size_t tmp_permute_nb[GGML_MAX_DIMS]; + tmp_permute_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src1)); + tmp_permute_tenosr = create_acl_tensor( + tmp_permute_buffer, ACL_FLOAT, sizeof(float), + tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + int64_t permute_dim[] = {0, 2, 1, 3}; + int64_t num_dims = 4; + aclnn_permute(ctx, acl_src1_fp32_tensor, tmp_permute_tenosr, permute_dim, num_dims, dst); + } + + // alibi + const int n_head = src0->ne[2]; + const size_t src_nb0 = src0->nb[0]; + + n_bytes = ggml_nbytes(dst); + void* output_buffer = ctx.alloc_buffer(dst, n_bytes); + aclTensor* alibi_output_tensor = create_acl_tensor( + output_buffer, + ACL_FLOAT, + ggml_type_size(dst->type), + dst->ne, dst->nb, + GGML_MAX_DIMS); + if (max_bias <=0.0f) { + // slope = 1.0 + if (tmp_permute_tenosr) { + aclnn_add(ctx, tmp_permute_tenosr, acl_input_mul_scale_tensor, + alibi_output_tensor, dst); + } + else { + aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor, + alibi_output_tensor, dst); + } + + } + else { + // slope != 1.0 + if (tmp_permute_tenosr) { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_permute_tenosr, + alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, + dst); + } + else { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, acl_src1_fp32_tensor, + alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, + dst); + } + } + + // softmax + aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst, dst); + ACL_CHECK(aclDestroyTensor(alibi_output_tensor)); + } + else { + aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst, dst); + } + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyScalar(acl_scale)); + ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr)); + +} + +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F32: + aclrtlaunch_ascendc_get_row_f32( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src0->extra)->nb, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + case GGML_TYPE_F16: + aclrtlaunch_ascendc_get_row_f16( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src0->extra)->nb, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + case GGML_TYPE_Q4_0: + aclrtlaunch_ascendc_get_row_q4_0( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + 
break; + case GGML_TYPE_Q8_0: + aclrtlaunch_ascendc_get_row_q8_0( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + default: + GGML_ASSERT(false); + break; + } +} + +static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t dim, int64_t repeats, + int64_t output_size, ggml_tensor* bind_tensor) { + // each elem in acl_src will repeat. repeat number is `repeats`, repeats dim + // is `dim`. + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(acl_src, repeats, + dim, output_size, + acl_dst, + &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, executor, + main_stream)); + +} + +static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is fp32, + // atlas a2 will transpose it to HFLOAT32. + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnMatmul(workspaceAddr, workspaceSize, executor, + main_stream)); +} + +static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // weight + ggml_tensor* src1 = dst->src[1]; // input + + // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto broadcast, + // when weight ne2 or ne3 is not 1, weight need repeat. 
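+ // e.g. a weight of shape [K, N, 2, 1] multiplied by an input of shape
+ // [K, M, 6, 1] needs the weight repeated 3x along ggml dim 2 (done below
+ // with aclnnRepeatInterleaveIntWithDim) before the batched matmul.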
+ int repeat_dim2 = 1; + if (src0->ne[2] != src1->ne[2] && src0->ne[2] !=1) { + repeat_dim2 = src1->ne[2] / src0->ne[2]; + } + int repeat_dim3 = 1; + if (src0->ne[3] != src1->ne[3] && src0->ne[3] !=1) { + repeat_dim3 = src1->ne[3] / src0->ne[3]; + } + + int64_t weight_repeat_ne[] = {src0->ne[0], src0->ne[1], + src0->ne[2]*repeat_dim2, + src0->ne[3]*repeat_dim3}; + size_t weight_repeat_nb[GGML_MAX_DIMS]; + weight_repeat_nb[0] = src0->nb[0]; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + weight_repeat_nb[i] = weight_repeat_nb[i - 1] * weight_repeat_ne[i - 1]; + }; + + void* acl_repeat_weight_buffer = nullptr; + aclTensor* acl_repeat_weight_tensor = nullptr; + aclTensor* acl_weight_tensor =create_acl_tensor(src0); + if (repeat_dim2 > 1) { + weight_repeat_ne[3] = src0->ne[3]; + acl_repeat_weight_buffer = ctx.alloc_buffer(dst, + ggml_nelements(src0) + *repeat_dim2 + *ggml_type_size(src0->type)); + acl_repeat_weight_tensor = create_acl_tensor(acl_repeat_weight_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + weight_repeat_ne, + weight_repeat_nb, + GGML_MAX_DIMS); + + int64_t dim = 1; + int64_t output_size = src0->ne[2]*repeat_dim2; + aclnn_repeat_interleave(ctx, acl_weight_tensor, + acl_repeat_weight_tensor, dim, repeat_dim2, + output_size, dst); + } + if (repeat_dim3 > 1) { + weight_repeat_ne[3] = src0->ne[3]*repeat_dim3; + acl_repeat_weight_buffer = ctx.alloc_buffer(dst, + ggml_nelements(src0) + *repeat_dim2*repeat_dim3 + *ggml_type_size(src0->type)); + aclTensor* acl_repeat_weight_tensor2 = create_acl_tensor( + acl_repeat_weight_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + weight_repeat_ne, + weight_repeat_nb, + GGML_MAX_DIMS); + int64_t dim = 0; + int64_t output_size = src0->ne[3]*repeat_dim3; + if (acl_repeat_weight_tensor==nullptr) { + aclnn_repeat_interleave(ctx, acl_weight_tensor, + acl_repeat_weight_tensor2, dim, repeat_dim3, + output_size, dst); + } + else { + aclnn_repeat_interleave(ctx, acl_repeat_weight_tensor, + acl_repeat_weight_tensor2, dim, repeat_dim3, + output_size, dst); + } + ACL_CHECK(aclDestroyTensor(acl_repeat_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_repeat_weight_tensor2)); + } + if (acl_repeat_weight_buffer==nullptr) { + acl_repeat_weight_buffer = src0->data; + } + + // transpose weight: [1,2,3,4] -> [1,2,4,3] + int64_t weight_ne[] = {weight_repeat_ne[1], weight_repeat_ne[0], + weight_repeat_ne[2], weight_repeat_ne[3]}; + size_t weight_nb[] = {weight_repeat_nb[1], weight_repeat_nb[0], + weight_repeat_nb[2], weight_repeat_nb[3]}; + + aclTensor* acl_weight_transpose_tensor = create_acl_tensor( + acl_repeat_weight_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + weight_ne, + weight_nb, + GGML_MAX_DIMS); + + // mul_mat + aclTensor* acl_input_tensor = create_acl_tensor(src1); + aclTensor* acl_dst = create_acl_tensor(dst); + aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_transpose_tensor, acl_dst, + dst); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_weight_transpose_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // weight + ggml_tensor* src1 = dst->src[1]; // input + + // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC + // is regarded as batch. weight need transpose. 
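// Assumed device layout for Q8_0 weights (a sketch inferred from the offset
// math below, not an authoritative spec): all int8 quants are stored first,
// followed by one fp16 scale per group of QK8_0 = 32 values, so per 2D matrix
//   quant bytes : ne0 * ne1 * sizeof(int8_t)
//   scale bytes : ne0 / QK8_0 * ne1 * sizeof(uint16_t)
// and the scales of batch b start at
//   (char*)src0->data + total_quant_bytes + b * scale_stride
// which is exactly how weight_stride, scale_offset and scale_stride are
// derived next.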
+ int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; + size_t weight_elem_size = sizeof(uint8_t); + size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; + // size of one matrix is element_size * height * width. + size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1]; + size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; + + // scale stored at the end of weight. Also need transpose. + int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0}; + size_t scale_elem_size = sizeof(uint16_t); + size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, + scale_elem_size}; + size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0; + char* scale_offset = (char*)src0->data + weight_size; + + // input + void* input_buffer; + size_t input_elem_size = sizeof(uint16_t); + int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; + size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; + size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; + + if (src1->type != GGML_TYPE_F16) { + aclTensor* acl_src1_tensor = create_acl_tensor(src1); + input_buffer = + ctx.alloc_buffer(dst, ggml_nelements(src1) * input_elem_size); + + int64_t* input_cast_ne = src1->ne; + size_t input_cast_nb[GGML_MAX_DIMS]; + input_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; + } + + aclTensor* acl_input_tensor = + create_acl_tensor(input_buffer, ACL_FLOAT16, input_elem_size, + input_cast_ne, input_cast_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16, dst); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); + } else { + input_buffer = src1->data; + } + + // output + size_t output_elem_size = sizeof(uint16_t); + int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; + size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; + void* output_buffer = + ctx.alloc_buffer(dst, ggml_nelements(dst) * output_elem_size); + size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; + + // aclnn + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { + for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { + int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); + int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); + + int64_t batch1 = n1 * src1->ne[2] + c1; + int64_t batch0 = n0 * src0->ne[2] + c0; + + aclTensor* acl_input_tensor = create_acl_tensor( + (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, + input_elem_size, input_ne, input_nb, 2); + aclTensor* acl_weight_tensor = create_acl_tensor( + (char*)src0->data + batch0 * weight_stride, ACL_INT8, + weight_elem_size, weight_ne, weight_nb, 2); + aclTensor* acl_scale_tensor = create_acl_tensor( + scale_offset + batch0 * scale_stride, ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2); + aclTensor* acl_output_tensor = create_acl_tensor( + (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, + nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, + &workspaceSize, &executor)); + + if (workspaceSize > 0 && workspaceAddr == nullptr) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, 
ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + } + } + + // cast out + int64_t* output_cast_ne = dst->ne; + size_t output_cast_nb[GGML_MAX_DIMS]; + output_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; + } + + aclTensor* acl_output_tensor = + create_acl_tensor(output_buffer, ACL_FLOAT16, output_elem_size, + output_cast_ne, output_cast_nb, GGML_MAX_DIMS); + aclTensor* acl_dst_tensor = create_acl_tensor(dst); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT, dst); + + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); +} + +void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + const enum ggml_type type = dst->src[0]->type; + switch (type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + ggml_cann_mat_mul_fp(ctx, dst); + break; + // case GGML_TYPE_Q4_0: + // ggml_cann_mul_mat_q4_0(ctx, dst); + // break; + case GGML_TYPE_Q8_0: + ggml_cann_mul_mat_q8_0(ctx, dst); + break; + default: + GGML_ASSERT(false); + break; + } +} + +static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* shifts, int64_t* dims, + ggml_tensor* bind_tensor) { + + aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); + aclIntArray* acl_dims = aclCreateIntArray(dims, 1); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnRoll(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyIntArray(acl_shifts)); + ACL_CHECK(aclDestroyIntArray(acl_dims)); +} + +void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src1 = dst->src[1]; // position + + // param init + rope_param param; + for (int i=0; i<4; i++) { + param.input_ne[i] = src0->ne[i]; + param.position_ne[i] = src1->ne[i]; + } + + const int mode = ((int32_t *) dst->op_params)[2]; + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + param.is_neox = is_neox; + param.is_glm = is_glm; + + memcpy(¶m.freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(¶m.freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(¶m.ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(¶m.attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(¶m.beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(¶m.beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + param.n_dims = ((int32_t *) dst->op_params)[1]; + param.n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + const float theta_scale = powf(param.freq_base, -2.0f/param.n_dims); + param.theta_scale = theta_scale; + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(param.n_dims, param.n_orig_ctx, param.freq_base, + param.beta_fast, param.beta_slow, corr_dims); + param.corr_dims[0] = corr_dims[0]; + param.corr_dims[1] = corr_dims[1]; + + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(rope_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + 
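// Worked equation for the parameters packed into rope_param above (standard
// RoPE, stated here for orientation): for position p and lane pair i the
// rotation angle is
//   theta(p, i) = p * freq_base^(-2*i/n_dims) = p * theta_scale^i,
//   theta_scale = freq_base^(-2/n_dims)
// so the init-cache kernel only needs theta_scale and can build the cos/sin
// tables iteratively (theta_{i+1} = theta_i * theta_scale), while corr_dims
// holds the YaRN correction range returned by ggml_rope_yarn_corr_dims.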
ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(rope_param), ¶m, + sizeof(rope_param), ACL_MEMCPY_HOST_TO_DEVICE)); + + // cast position: i32 to fp32 + aclTensor* acl_position_tensor = create_acl_tensor(src1); + void* position_cast_buffer = ctx.alloc_buffer(dst, ggml_nelements(src1) + * sizeof(float_t)); + int64_t* position_cast_ne = src1->ne; + size_t position_cast_nb[GGML_MAX_DIMS]; + position_cast_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + position_cast_nb[i] = position_cast_nb[i - 1] * position_cast_ne[i - 1]; + } + aclTensor* acl_postion_cast_tensor = create_acl_tensor(position_cast_buffer, + ACL_FLOAT, + sizeof(float_t), + position_cast_ne, + position_cast_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_position_tensor, acl_postion_cast_tensor, ACL_FLOAT, + dst); + + // init cos/sin cache, + void* sin_buffer = ctx.alloc_buffer(dst, src0->ne[0] * src0->ne[2] + * sizeof(float_t)); + void* cos_buffer = ctx.alloc_buffer(dst, src0->ne[0] * src0->ne[2] + * sizeof(float_t)); + + + aclrtlaunch_ascendc_rope_init_cache(param.position_ne[0], ctx.stream(), + position_cast_buffer, + sin_buffer, cos_buffer, + param_buffer, + ((ggml_tensor*)src0->extra)->ne); + ACL_CHECK(aclrtFree(param_buffer)); + + // reshape sin&cos + // TODO: ne[3] != 0 + int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; + size_t sin_reshape_nb[GGML_MAX_DIMS]; + sin_reshape_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; + } + aclTensor* acl_sin_reshape_tensor = create_acl_tensor(sin_buffer, ACL_FLOAT, + sizeof(float_t), + sin_reshape_ne, + sin_reshape_nb, + GGML_MAX_DIMS); + aclTensor* acl_cos_reshape_tensor = create_acl_tensor(cos_buffer, ACL_FLOAT, + sizeof(float_t), + sin_reshape_ne, + sin_reshape_nb, + GGML_MAX_DIMS); + + // TODO: warp the following as aclrtlaunch_ascendc_rope_cal<<<>>> + // roll input + void* input_roll_buffer; + aclTensor* acl_minus_one_tensor; + if (is_glm) { + // TODO + GGML_ASSERT(false); + } + else if (!is_neox) { + // roll input: [q0,q1,q2,...] -> [q1,q0,q3,q2...] + input_roll_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src0)); + int64_t input_roll_ne[4] = {2, src0->ne[1]*(src0->ne[0]/2), src0->ne[2], + src0->ne[3]}; + size_t input_roll_nb[GGML_MAX_DIMS]; + input_roll_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; + } + aclTensor* acl_input_roll_tensor = create_acl_tensor( + input_roll_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + input_roll_ne, + input_roll_nb, + GGML_MAX_DIMS); + aclTensor* acl_input_tensor = create_acl_tensor( + src0->data, + type_mapping(src0->type), + ggml_type_size(src0->type), + input_roll_ne, + input_roll_nb, + GGML_MAX_DIMS); + + int64_t shifts[] = {1}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims, + dst); + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + + // init [-1, 1, -1, 1, ...] 
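// Why a roll plus an alternating sign vector reproduces the pairwise rotation
// (a sketch of the identity, under the assumption that the roll above swaps
// each (x[2i], x[2i+1]) pair):
//   out[2i]   = x[2i]   * cos - x[2i+1] * sin
//   out[2i+1] = x[2i+1] * cos + x[2i]   * sin
// With rolled[2i] = x[2i+1] and rolled[2i+1] = x[2i] this is
//   out = x * cos + rolled * sign * sin,   sign = [-1, 1, -1, 1, ...]
// which is what the multiplies further down implement; the sign vector is
// initialized next.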
+ void* minus_one_scale_buffer = ctx.alloc_buffer(dst, sizeof(int64_t) + * src0->ne[0]); + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(int64_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = create_acl_tensor(minus_one_scale_buffer, + ACL_INT64, sizeof(int64_t), + minus_one_ne, minus_one_nb, + GGML_MAX_DIMS); + int64_t* minus_one_scale = new int64_t[src0->ne[0]]; + for (int i=0; ine[0]; i+=2) { + minus_one_scale[i] = -1.0; + minus_one_scale[i+1] = 1.0; + } + + aclrtMemcpy(minus_one_scale_buffer, src0->ne[0] * sizeof(int64_t), + minus_one_scale, src0->ne[0] * sizeof(int64_t), + ACL_MEMCPY_HOST_TO_DEVICE); + delete[] minus_one_scale; + } + else { + // roll input: [q0,q1,q2,...] -> [q_half,q_half+1,..., q0,q1,...q_half-1] + input_roll_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src0)); + aclTensor* acl_input_roll_tensor = create_acl_tensor( + input_roll_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + src0->ne, src0->nb, + GGML_MAX_DIMS); + aclTensor* acl_input_tensor = create_acl_tensor(src0); + + int64_t shifts[] = {src0->ne[0] / 2}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims, + dst); + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + + // init [-1, -1, -1, 1, 1,1,...] + void* minus_one_scale_buffer = ctx.alloc_buffer(dst, sizeof(int64_t) + * src0->ne[0]); + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(int64_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = create_acl_tensor(minus_one_scale_buffer, + ACL_INT64, sizeof(int64_t), + minus_one_ne, minus_one_nb, + GGML_MAX_DIMS); + int64_t* minus_one_scale = new int64_t[src0->ne[0]]; + for (int i=0; ine[0]; i++) { + if (i < src0->ne[0]/2) { + minus_one_scale[i] = -1.0; + } + else { + minus_one_scale[i] = 1.0; + } + } + + aclrtMemcpy(minus_one_scale_buffer, src0->ne[0] * sizeof(int64_t), + minus_one_scale, src0->ne[0] * sizeof(int64_t), + ACL_MEMCPY_HOST_TO_DEVICE); + delete[] minus_one_scale; + } + + // input * scale + void* input_roll_mul_scale_buffer = ctx.alloc_buffer(dst, + ggml_nbytes(src0)); + size_t input_nb[GGML_MAX_DIMS]; + input_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* acl_input_roll_mul_scale_tensor = create_acl_tensor( + input_roll_mul_scale_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + src0->ne, input_nb, + GGML_MAX_DIMS); + aclTensor* acl_input_roll_reshape_tensor = create_acl_tensor( + input_roll_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + src0->ne, input_nb, + GGML_MAX_DIMS); + aclnn_noinplcace_mul(ctx, acl_input_roll_reshape_tensor, + acl_minus_one_tensor, acl_input_roll_mul_scale_tensor, + dst); + + // output + aclTensor* acl_src0 = create_acl_tensor(src0); + aclTensor* acl_dst = create_acl_tensor(dst); + void* output_fp32_buffer; + if (src0->type == GGML_TYPE_F32) { + aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor, dst); + aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor, dst); + aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst, dst); + // TODO: zeta scaling for xPos + // TODO: ne0 
!= n_dims in mode2 + } + else if (src0->type == GGML_TYPE_F16) { + size_t input_fp32_nb[GGML_MAX_DIMS]; + input_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; + } + void* input_fp32_buffer1 = ctx.alloc_buffer(dst, ggml_nelements(dst) + * sizeof(float_t)); + aclTensor* input_fp32_tensor1 = create_acl_tensor(input_fp32_buffer1, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, + GGML_MAX_DIMS); + void* input_fp32_buffer2 = ctx.alloc_buffer(dst, ggml_nelements(dst) + * sizeof(float_t)); + aclTensor* input_fp32_tensor2 = create_acl_tensor(input_fp32_buffer2, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, + GGML_MAX_DIMS); + + output_fp32_buffer = ctx.alloc_buffer(dst, ggml_nelements(dst) + * sizeof(float_t)); + aclTensor* output_fp32_tensor = create_acl_tensor(output_fp32_buffer, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, + GGML_MAX_DIMS); + aclnn_noinplcace_mul(ctx, acl_src0, acl_cos_reshape_tensor, + input_fp32_tensor1, dst); + aclnn_noinplcace_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor, input_fp32_tensor2, + dst); + aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, + output_fp32_tensor, dst); + aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16, dst); + + ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); + ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); + ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); + } + + ACL_CHECK(aclDestroyTensor(acl_position_tensor)); + ACL_CHECK(aclDestroyTensor(acl_postion_cast_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h new file mode 100644 index 0000000000000..56d975990e0f9 --- /dev/null +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -0,0 +1,182 @@ +#ifndef CANN_ACLNN_OPS +#define CANN_ACLNN_OPS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl_tensor.h" +#include "common.h" + +void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_pad(ggml_backend_cann_context& 
ctx, ggml_tensor* dst); + +void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value); + +void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, + ggml_tensor* dst); + +template +void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + aclTensor* acl_src0; + aclTensor* acl_src1; + aclTensor* acl_dst; + + // Need bcast + if (!ggml_are_same_shape(src0, src1) && need_bcast(src0, src1)) { + BCAST_SHAPE(src0, src1) + acl_src0 = create_acl_tensor(src0, BCAST_PARAM(src0)); + acl_src1 = create_acl_tensor(src1, BCAST_PARAM(src1)); + acl_dst = create_acl_tensor(dst, BCAST_PARAM(src0)); + } else { + acl_src0 = create_acl_tensor(src0); + acl_src1 = create_acl_tensor(src1); + acl_dst = create_acl_tensor(dst); + } + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +// Activation functions template. +template +void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +// Activation functions template for const aclTensors. 
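// Hypothetical instantiations of the two templates above (assumed usage; the
// template parameter list is expected to carry the getWorkspaceSize/execute
// functions that the bodies forward to, and aclnnMul/aclnnDiv/aclnnRelu are
// the corresponding CANN aclnn entry points):
//   ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);  // GGML_OP_MUL
//   ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);  // GGML_OP_DIV
//   ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(ctx, dst);
// The variant announced by the comment above, for aclnn ops that take
// const aclTensor* arguments, follows.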
+template +void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +#endif // CANN_ACLNN_OPS \ No newline at end of file diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h new file mode 100644 index 0000000000000..892a2259b10ae --- /dev/null +++ b/ggml/src/ggml-cann/common.h @@ -0,0 +1,141 @@ +#ifndef CANN_COMMON_H +#define CANN_COMMON_H + +#include + +#include +#include +#include +#include +#include + +#include "../include/ggml-cann.h" +#include "../include/ggml.h" + +#define MATRIX_ROW_PADDING 512 +#define GGML_CANN_MAX_STREAMS 8 + +[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, + const char* file, int line, const char* msg); + +// Error handling macro +#define ACL_CHECK_GEN(stmt, success, error_fn) \ + do { \ + int err_code = (stmt); \ + if (err_code != (success)) { \ + ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \ + } \ + } while (0); + +#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg) + +struct ggml_cann_device_info { + int32_t device_count; + + // TODO: add more device info later. + // struct cann_device_info { + // int cc; // compute capability + // size_t smpb; // max. shared memory per block + // bool vmm; // virtual memory support + // size_t vmm_granularity; // granularity of virtual memory + // size_t total_vram; + // }; + + // cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; +}; + +const ggml_cann_device_info& ggml_cann_info(); + +void ggml_cann_set_device(int32_t device); +int32_t ggml_cann_get_device(); + +struct ggml_backend_cann_context { + int32_t device; + std::string name; + aclrtEvent copy_event = nullptr; + + aclrtStream streams[GGML_CANN_MAX_STREAMS] = {{nullptr}}; + + // bind temp buffers to stream. Free after sync. + std::multimap buffers[GGML_CANN_MAX_STREAMS]; + + explicit ggml_backend_cann_context(int device) + : device(device), name(GGML_CANN_NAME + std::to_string(device)) {} + + ~ggml_backend_cann_context() { + if (copy_event != nullptr) { + ACL_CHECK(aclrtDestroyEvent(copy_event)); + } + for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) { + if (streams[i] != nullptr) { + ACL_CHECK(aclrtDestroyStream(streams[i])); + // Buffers should have been freed. + GGML_ASSERT(buffers[i].size() == 0); + } + } + } + + void* alloc_buffer(ggml_tensor* dst, size_t size, int stream) { + void* buffer; + ACL_CHECK(aclrtMalloc(&buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); + bind_buffer(dst, buffer, stream); + return buffer; + } + + void* alloc_buffer(ggml_tensor* dst, size_t size) { + return alloc_buffer(dst, size, 0); + } + + // Free all buffers bind to all streams. + void free_device_buffers() { + for (int i = 0; i < GGML_CANN_MAX_STREAMS; i++) { + for (auto& it : buffers[i]) { + ACL_CHECK(aclrtFree(it.second)); + } + buffers[i].clear(); + } + } + + // Free all buffers bind to stream. 
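// Intended lifetime of the temporary buffers (an interpretation of the
// comments in this struct, not a guarantee): every alloc_buffer() call above
// both allocates device memory and binds it to (stream, dst tensor), so a
// typical op would do
//   void* ws = ctx.alloc_buffer(dst, workspaceSize); // bound to stream 0, dst
//   ... launch the aclnn op that uses ws ...
// and the memory is released either per tensor (free_tensor_buffers) or in
// bulk after a stream sync (free_stream_buffers below, free_device_buffers
// above). No buffer is reused across graph evaluations in this version.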
+ void free_stream_buffers(int stream) { + for (auto& it : buffers[stream]) { + ACL_CHECK(aclrtFree(it.second)); + } + buffers[stream].clear(); + } + + // Free all buffers belong to dst. + // Remove it from stream buffers to avoid double free. + void free_tensor_buffers(ggml_tensor* dst) { + // ggml_tensor.extra means which stream are tensor in. + for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) { + if (streams[i] != nullptr) { + for (auto pos = buffers[i].equal_range(dst); + pos.first != pos.second; ++pos.first) { + ACL_CHECK(aclrtFree(pos.first->second)); + } + buffers[i].erase(dst); + } + } + } + + aclrtStream stream(int stream) { + if (streams[stream] == nullptr) { + ggml_cann_set_device(device); + ACL_CHECK(aclrtCreateStream(&streams[stream])); + } + return streams[stream]; + } + + // All temp buffers should bind to stream and the dst tensor. + // It will be free if: + // 1. dst tensor are no longer used any more. + // 2. after stream sync. + void bind_buffer(ggml_tensor* dst, void* buf, int stream) { + buffers[stream].insert(std::make_pair(dst, buf)); + } + + aclrtStream stream() { return stream(0); } +}; + +#endif // CANN_COMMON_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt new file mode 100644 index 0000000000000..c0119a3ad7908 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -0,0 +1,33 @@ +if (NOT SOC_TYPE) + set (SOC_TYPE "Ascend910B3") +endif() + +file(GLOB SRC_FILES + get_row_f32.cpp + get_row_f16.cpp + get_row_q4_0.cpp + get_row_q8_0.cpp + quantize_f32_q8_0.cpp + quantize_f16_q8_0.cpp + rope_init_cache.cpp + dup.cpp +) + +string(TOLOWER ${SOC_TYPE} SOC_VERSION) +set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) +set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels STATIC + ${SRC_FILES} +) + +#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h new file mode 100644 index 0000000000000..6ea8a97e79575 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h @@ -0,0 +1,20 @@ +#ifndef ASCENDC_KERNELS_H +#define ASCENDC_KERNELS_H + +#include "aclrtlaunch_ascendc_get_row_f32.h" +#include "aclrtlaunch_ascendc_get_row_f16.h" +#include "aclrtlaunch_ascendc_get_row_q8_0.h" +#include "aclrtlaunch_ascendc_get_row_q4_0.h" + +#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" +#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" + +#include "aclrtlaunch_ascendc_rope_init_cache.h" +#include "rope.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" +#include "dup.h" + +#endif // ASCENDC_KERNELS_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp new file mode 100644 index 0000000000000..3da38d918bda7 
--- /dev/null +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -0,0 +1,211 @@ +#include "kernel_operator.h" +#include "dup.h" + +#include + +using namespace AscendC; + +#define BUFFER_NUM 2 + +template +class DupByRows { + public: + __aicore__ inline DupByRows() {} + __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, dup_param& param) { + /* Dup by rows when src is contigous on first dimension and dst is + contiguous, each kernel process one row. + */ + + // Input has four dims. + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + // param + num_rows = param.src_ne[1] * param.src_ne[2] * param.src_ne[3]; + num_elem = param.src_ne[0]; + + // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) + idx_ne3 = op_block_idx / (param.src_ne[1] * param.src_ne[2]); + idx_ne2 = (op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2])) + / (param.src_ne[1]); + idx_ne1 = op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2]) + - idx_ne2 * param.src_ne[1]; + + // src may not contiguous in dim [1,2,3], so stride decited by ne&nb + src_stride = param.src_nb[3] * idx_ne3 + param.src_nb[2] * idx_ne2 + + param.src_nb[1] * idx_ne1; + + // dst is contiguous + dst_stride = (idx_ne3 * (param.src_ne[1] * param.src_ne[2]) + + idx_ne2 * param.src_ne[1] + + idx_ne1) * (param.src_ne[0] * sizeof(DST_T)); + + src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + + src_stride)); + dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + + dst_stride)); + + pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + + 32 - 1) / 32 * 32); + pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + + 32 - 1) / 32 * 32); + } + + __aicore__ inline void copy_in() { + LocalTensor src_local = src_queue.AllocTensor(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = num_elem * sizeof(SRC_T); + DataCopyPadExtParams padParams; + DataCopyPad(src_local, src_gm, dataCopyParams, padParams); + + src_queue.EnQue(src_local); + } + + __aicore__ inline void copy_out() { + LocalTensor dst_local = dst_queue.DeQue(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = num_elem * sizeof(DST_T); + DataCopyPad(dst_gm, dst_local, dataCopyParams); + + dst_queue.FreeTensor(dst_local); + } + + __aicore__ inline void dup() { + // main process, copy one row data from src to dst. + copy_in(); + + LocalTensor src_local = src_queue.DeQue(); + LocalTensor dst_local = dst_queue.AllocTensor(); + + int32_t BLOCK_NUM = 32 / sizeof(DST_T); + DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) + / BLOCK_NUM * BLOCK_NUM); + dst_queue.EnQue(dst_local); + + src_queue.FreeTensor(src_local); + copy_out(); + } + + __aicore__ inline void dup_with_cast() { + // main process, copy one row data from src to dst. + // cast dtype from src to dst. 
+ copy_in(); + + LocalTensor src_local = src_queue.DeQue(); + LocalTensor dst_local = dst_queue.AllocTensor(); + + Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); + dst_queue.EnQue(dst_local); + + src_queue.FreeTensor(src_local); + copy_out(); + } + + private: + + TPipe pipe; + GlobalTensor src_gm; + GlobalTensor dst_gm; + + int64_t num_rows; + int64_t num_elem; + int64_t idx_ne3; + int64_t idx_ne2; + int64_t idx_ne1; + int64_t src_stride; + int64_t dst_stride; + + TQue src_queue; + TQue dst_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup_with_cast(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup_with_cast(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/dup.h b/ggml/src/ggml-cann/kernels/dup.h new file mode 100644 index 0000000000000..d58cd15ef2563 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/dup.h @@ -0,0 +1,14 @@ +#ifndef DUP_H +#define DUP_H + +#pragma pack(push, 8) +typedef struct { + int64_t src_ne[4]; + int64_t src_nb[4]; + int64_t dst_ne[4]; + int64_t dst_nb[4]; + +} dup_param; +#pragma pack(pop) + +#endif //DUP_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp new file mode 100644 index 0000000000000..352aba07e5d50 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp @@ -0,0 +1,186 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. 
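// Reference semantics of this kernel (a plain restatement of ggml's GET_ROWS
// for orientation; ne/nb follow ggml conventions):
//   for each (i0, i1, i2) over the index tensor:
//       row = indices[i0, i1, i2];
//       output[:, i0, i1, i2] = (float) input[:, row, i1, i2];
// Each AI core receives a contiguous slice of the flattened index space (the
// ir/dr split in init below) and converts one fp16 row to fp32 per iteration.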
+using namespace AscendC; + +#define BUFFER_NUM 2 + +class GET_ROW_F16 { + public: + __aicore__ inline GET_ROW_F16() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *indices_ne_ub, size_t *indices_nb_ub, + int64_t *output_ne_ub, size_t *output_nb_ub) { + // TODO, use template for F16/f32 + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. + uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ half *)input); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31) + & ~31); + uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31) + & ~31); + + local_buffer_elems = input_local_buffer_size / sizeof(half); + + // TODO, consider long row that can't put in UB. + // All data should asign to 32. It's ok because all data is align to 32. + pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size); + pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size); + } + + __aicore__ inline void copy_in(uint32_t offset, size_t len) { + LocalTensor input_local = input_queue.AllocTensor(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(input_local, input_gm[offset], len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(half); + DataCopyPadExtParams padParams; + DataCopyPad(input_local[len], input_gm[offset + len], + dataCopyParams, padParams); + } + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset, size_t len) { + LocalTensor output_local = output_queue.DeQue(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(output_gm[offset], output_local, len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPad(output_gm[offset + len], output_local[len], + dataCopyParams); + } + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_row(int64_t idx) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3]; 
+ + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3]; + + copy_in(input_offset, input_ne[0]); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + + Cast(output_local, input_local, RoundMode::CAST_NONE, + local_buffer_elems); + output_queue.EnQue(output_local); + copy_out(output_offset, input_ne[0]); + + input_queue.FreeTensor(input_local); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + calculate_row(i); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + size_t local_buffer_elems; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_f16( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, + GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_F16 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, + indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp new file mode 100644 index 0000000000000..d44d153b335ef --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp @@ -0,0 +1,180 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +class GET_ROW_F32 { + public: + __aicore__ inline GET_ROW_F32() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *indices_ne_ub, size_t *indices_nb_ub, + int64_t *output_ne_ub, size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
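// Work-split example for the ir/dr computation that follows (illustrative):
// with n_elements = 10 rows and op_block_num = 3 cores, dr = 3 and tails = 1,
// so core 0 takes rows [0, 4) (its dr is bumped to 4), core 1 takes [4, 7)
// and core 2 takes [7, 10); every row is covered exactly once. The same split
// is repeated in all get_row_* kernels.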
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ float *)input); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31); + local_buffer_elems = local_buffer_size / sizeof(float); + + // TODO, consider long row that can't put in UB. + // All data should asign to 32. It's ok because all data is align to 32. + pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size); + pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size); + } + + __aicore__ inline void copy_in(uint32_t offset, size_t len) { + LocalTensor input_local = input_queue.AllocTensor(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(input_local, input_gm[offset], len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPadExtParams padParams; + DataCopyPad(input_local[len], input_gm[offset + len], + dataCopyParams, padParams); + } + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset, size_t len) { + LocalTensor output_local = output_queue.DeQue(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(output_gm[offset], output_local, len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPad(output_gm[offset + len], output_local[len], + dataCopyParams); + } + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_row(int64_t idx) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3]; + + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3]; + + copy_in(input_offset, input_ne[0]); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + + DataCopy(output_local, input_local, local_buffer_elems); + output_queue.EnQue(output_local); + copy_out(output_offset, input_ne[0]); + + input_queue.FreeTensor(input_local); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + calculate_row(i); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + size_t local_buffer_elems; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; +}; + +template +__aicore__ 
inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_f32( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, + GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_F32 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, + indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp new file mode 100644 index 0000000000000..d3f229c1980b5 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp @@ -0,0 +1,193 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +#define QK4_0 32 + +class GET_ROW_Q4_0 { + public: + __aicore__ inline GET_ROW_Q4_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, int64_t *indices_ne_ub, + size_t *indices_nb_ub, int64_t *output_ne_ub, + size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + scale_ne[i] = input_ne_ub[i]; + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // one scale for a group. + scale_ne[0] /= QK4_0; + + input_stride[0] = 1; + scale_stride[0] = 1; + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + group_size_in_row = input_ne[0] / QK4_0; + int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * + input_ne[3] / 2; + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ int4b_t *)input); + scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t)); + pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error? + DataCopy(input_local, input_gm[offset], QK4_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK4_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_group(int64_t idx, int64_t group) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3] + + group * QK4_0; + const int64_t scale_offset = selected_row_idx * scale_stride[1] + + indices_ne1_idx * scale_stride[2] + + indices_ne2_idx * scale_stride[3] + group; + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3] + + group * QK4_0; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor output_local = output_queue.AllocTensor(); + + // TODO: cast more data to speed up. + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); + Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); + + // Only mul need compile by group. 
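// Dequantization done here, per group of QK4_0 = 32 weights (a sketch that
// assumes the buffer holds signed int4 quants as prepared by the backend's
// Q4_0 transform, i.e. the usual ggml offset of -8 is already folded in):
//   w[j] = scale * q[j],   q[j] in [-8, 7],  scale stored as fp16 per group
// The two Casts above widen int4 -> half -> float, and the Muls below applies
// the per-group scale fetched from the scale area appended after the quants.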
+ half scale = scale_gm.GetValue(scale_offset); + + Muls(output_local, output_local, (float)scale, QK4_0); + + input_queue.FreeTensor(input_local); + cast_queue.FreeTensor(cast_local); + output_queue.EnQue(output_local); + + copy_out(output_offset); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + calculate_group(i, j); + } + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t scale_ne[4]; + size_t scale_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t ir; + int64_t dr; + + int64_t group_size_in_row; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue cast_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_Q4_0 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, + indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp new file mode 100644 index 0000000000000..724669725b5cc --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp @@ -0,0 +1,191 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +#define QK8_0 32 + +class GET_ROW_Q8_0 { + public: + __aicore__ inline GET_ROW_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, int64_t *indices_ne_ub, + size_t *indices_nb_ub, int64_t *output_ne_ub, + size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + scale_ne[i] = input_ne_ub[i]; + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // one scale for a group. + scale_ne[0] /= QK8_0; + + input_stride[0] = 1; + scale_stride[0] = 1; + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + group_size_in_row = input_ne[0] / QK8_0; + int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * + input_ne[3] * sizeof(int8_t); + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ int8_t *)input); + scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_group(int64_t idx, int64_t group) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3] + + group * QK8_0; + const int64_t scale_offset = selected_row_idx * scale_stride[1] + + indices_ne1_idx * scale_stride[2] + + indices_ne2_idx * scale_stride[3] + group; + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3] + + group * QK8_0; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor output_local = output_queue.AllocTensor(); + + // TODO: cast more data to speed up. + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); + Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0); + + // Only mul need compile by group. 
+ half scale = scale_gm.GetValue(scale_offset); + Muls(output_local, output_local, (float)scale, QK8_0); + + input_queue.FreeTensor(input_local); + cast_queue.FreeTensor(cast_local); + output_queue.EnQue(output_local); + + copy_out(output_offset); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + calculate_group(i, j); + } + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t scale_ne[4]; + size_t scale_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t ir; + int64_t dr; + + int64_t group_size_in_row; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue cast_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_q8_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_Q8_0 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, + indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp new file mode 100644 index 0000000000000..5d33bf07a6303 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp @@ -0,0 +1,208 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +#define BUFFER_NUM 2 +#define QK8_0 32 + +class QUANTIZE_F16_Q8_0 { + public: + __aicore__ inline QUANTIZE_F16_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + } + + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / QK8_0; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. 
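Both the get_row and the quantize kernels assume the same flat device layout for a Q8_0 tensor: all int8 quants for the whole tensor come first, followed by one fp16 scale per QK8_0 group (this is what `scale_offset` in the get_row kernels and `output_size` below point at). A small host-side sketch of that layout, with illustrative names that are not part of the patch:

    #include <cstddef>
    #include <cstdint>

    struct q8_0_layout {
        size_t quants_bytes;   // ne0*ne1*ne2*ne3 * sizeof(int8_t)
        size_t scales_offset;  // scales start right after the quants
        size_t scales_bytes;   // one fp16 (2 bytes) per group of 32 values
    };

    static q8_0_layout q8_0_buffer_layout(const int64_t ne[4]) {
        const int64_t n_elems  = ne[0] * ne[1] * ne[2] * ne[3];
        const int64_t n_groups = n_elems / 32;          // QK8_0 == 32
        q8_0_layout l;
        l.quants_bytes  = (size_t) n_elems  * sizeof(int8_t);
        l.scales_offset = l.quants_bytes;
        l.scales_bytes  = (size_t) n_groups * sizeof(uint16_t);
        return l;
    }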
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t); + + input_gm.SetGlobalBuffer((__gm__ half *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir * + group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(work_queue, 1, 32); + pipe.InitBuffer(max_queue, 1, 32); + pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); + pipe.InitBuffer(scale_queue, 1, 32); + pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + QK8_0 * group; + + const int64_t output_offset = i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + QK8_0 * group; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor abs_local = abs_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); + Abs(abs_local, cast_local, QK8_0); + ReduceMax(max_local, abs_local, work_local, QK8_0); + + pipe_barrier(PIPE_ALL); + float d = max_local.GetValue(0); + d = d / ((1 << 7) - 1); + if (d != 0) { + Muls(cast_local, cast_local, 1.0f / d, QK8_0); + } + + Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0); + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + abs_queue.FreeTensor(abs_local); + max_queue.FreeTensor(max_local); + cast_queue.FreeTensor(cast_local); + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + if (scale_local_offset == 16) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, 16); + 
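The per-group math in calculate_group() above is the standard Q8_0 scheme: the group's scale is the maximum absolute value divided by 127, and each element is divided by that scale and rounded to an int8. A CPU reference of the same computation (a sketch; the kernel expresses it with Abs, ReduceMax, Muls and Cast with CAST_ROUND):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantize one group of QK8_0 floats; returns the scale that the kernel
    // later stores as fp16 behind the quantized data.
    static float quantize_q8_0_group(const float * x, int8_t * q) {
        float amax = 0.0f;
        for (int i = 0; i < 32; ++i) {                  // QK8_0 == 32
            amax = std::max(amax, std::fabs(x[i]));
        }
        const float d = amax / 127.0f;                  // (1 << 7) - 1
        for (int i = 0; i < 32; ++i) {
            q[i] = (int8_t) std::lround(d != 0.0f ? x[i] / d : 0.0f);
        }
        return d;
    }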
pipe_barrier(PIPE_ALL); + scale_global_offset += 16; + } + } + } + + if (scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue abs_queue; + TQue scale_queue; + TQue cast_queue; + +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_F16_Q8_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp new file mode 100644 index 0000000000000..bdb47231f04f3 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp @@ -0,0 +1,206 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +#define BUFFER_NUM 2 +#define QK8_0 32 + +class QUANTIZE_F32_Q8_0 { + public: + __aicore__ inline QUANTIZE_F32_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + } + + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / QK8_0; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. 
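Both quantize kernels stage the group scales in a small local buffer and write them back 16 at a time (16 halves is 32 bytes, one aligned block), flushing any remainder with a padded copy at the end. The same pattern in plain C++, where the `flush` callback stands in for DataCopy / DataCopyPad (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <vector>

    static void flush_scales_in_blocks(
            const std::vector<uint16_t> & scales,
            const std::function<void(size_t offset, size_t count, bool padded)> & flush) {
        const size_t block = 16;                        // 16 * sizeof(half) == 32 bytes
        size_t i = 0;
        for (; i + block <= scales.size(); i += block) {
            flush(i, block, false);                     // full block -> DataCopy
        }
        if (i < scales.size()) {
            flush(i, scales.size() - i, true);          // tail -> DataCopyPad
        }
    }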
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t); + + input_gm.SetGlobalBuffer((__gm__ float *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + + ir * group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(work_queue, 1, 32); + pipe.InitBuffer(max_queue, 1, 32); + pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); + pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half)); + pipe.InitBuffer(scale_queue, 1, 32); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + QK8_0 * group; + + const int64_t output_offset = i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + QK8_0 * group; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor abs_local = abs_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + + Abs(abs_local, input_local, QK8_0); + ReduceMax(max_local, abs_local, work_local, QK8_0); + pipe_barrier(PIPE_ALL); + float d = max_local.GetValue(0); + d = d / ((1 << 7) - 1); + if (d != 0) { + Muls(input_local, input_local, 1.0f / d, QK8_0); + } + + Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0); + Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0); + Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + abs_queue.FreeTensor(abs_local); + max_queue.FreeTensor(max_local); + cast_queue.FreeTensor(cast_local); + + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + if (scale_local_offset == 16) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, 16); + pipe_barrier(PIPE_ALL); + scale_global_offset += 16; + } + } + } + + if 
(scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue abs_queue; + TQue cast_queue; + TQue scale_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_F32_Q8_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/rope.h b/ggml/src/ggml-cann/kernels/rope.h new file mode 100644 index 0000000000000..223da0595b0a7 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/rope.h @@ -0,0 +1,19 @@ +#ifndef ROPE_H +#define ROPE_H + +#pragma pack(push, 8) +typedef struct { + int64_t input_ne[4]; + int64_t position_ne[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int n_dims; + int n_orig_ctx; + float theta_scale; + float corr_dims[2]; + bool is_neox; + bool is_glm; + +} rope_param; +#pragma pack(pop) + +#endif //ROPE_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/rope_init_cache.cpp b/ggml/src/ggml-cann/kernels/rope_init_cache.cpp new file mode 100644 index 0000000000000..1d55f8969a483 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/rope_init_cache.cpp @@ -0,0 +1,276 @@ +#include "kernel_operator.h" +#include "rope.h" + +#include + +using namespace AscendC; + +#define BUFFER_NUM 1 + +class InitCache { + public: + __aicore__ inline InitCache() {} + __aicore__ inline void init(GM_ADDR position, + GM_ADDR sin_output, + GM_ADDR cos_output, + rope_param& param, + int64_t* input_ne_ub) { + /*Init sin&cos cache for rope, impl of ggml_compute_forward_rope_f32(). + each kernel process input_ne[0]*1 cache. + */ + + // Input has four dims. [batch, seq_len, heads, head_dim]. 
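What one instance of this kernel produces, per position, is a sin table and a cos table of length head_dim: theta_i = pos * theta_scale^i for i in [0, head_dim/2), duplicated either pairwise (the non-neox, non-glm layout) or by halves (the neox layout), and both scaled by attn_factor. A host reference of that result (a sketch of the math, not of the AscendC implementation; `pos` here is the position value after any freq_scale adjustment):

    #include <cmath>
    #include <vector>

    static void rope_cache_ref(float pos, float theta_scale, float attn_factor,
                               int head_dim, bool neox_layout,
                               std::vector<float> & sin_out,
                               std::vector<float> & cos_out) {
        sin_out.assign(head_dim, 0.0f);
        cos_out.assign(head_dim, 0.0f);
        float theta = pos;                              // theta_scale^0 == 1
        for (int i = 0; i < head_dim / 2; ++i) {
            const float s = std::sin(theta) * attn_factor;
            const float c = std::cos(theta) * attn_factor;
            if (neox_layout) {                          // elements i and i + head_dim/2 share a value
                sin_out[i] = sin_out[i + head_dim / 2] = s;
                cos_out[i] = cos_out[i + head_dim / 2] = c;
            } else {                                    // elements 2i and 2i+1 share a value
                sin_out[2 * i] = sin_out[2 * i + 1] = s;
                cos_out[2 * i] = cos_out[2 * i + 1] = c;
            }
            theta *= theta_scale;
        }
    }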
+ int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + // arange param + // head_dim = param.input_ne[0]; + // head_dim = param.input_ne[0]; + head_dim = input_ne_ub[0]; + first_value = 0; + diff_value = 1; + count = head_dim / 2; + + // power param + theta_scale = param.theta_scale; + + // broadcast param + // arange_shape: [count, 1] -> broadcast_shape0: [count, 2] + arange_shape[0] = count; + arange_shape[1] = 1; + broadcast_shape0[0] = count; + broadcast_shape0[1] = 2; + + // arange_shape1: [1, count] -> broadcast_shape2: [2, count] + arange_shape1[0] = 1; + arange_shape1[1] = count; + broadcast_shape2[0] = 2; + broadcast_shape2[1] = count; + + // position_shape: [1, 1] -> broadcast_shape1: [1, head_dim] + position_shape[0] = 1; + position_shape[1] = 1; + broadcast_shape1[0] = 1; + broadcast_shape1[1] = head_dim; + + // position raw and brcst size. + position_size = 1; + broadcast_size = broadcast_shape1[0] * broadcast_shape1[1]; + + // other param + attn_factor = param.attn_factor; + freq_scale = param.freq_scale; + is_neox = param.is_neox; + is_glm = param.is_glm; + + // stride + position_stride = op_block_idx; + output_stride = op_block_idx * broadcast_size; + + position_gm.SetGlobalBuffer((__gm__ float_t*)position + position_stride, + 1); + output_sin_gm.SetGlobalBuffer((__gm__ float_t*)sin_output + + output_stride, + broadcast_size); + output_cos_gm.SetGlobalBuffer((__gm__ float_t*)cos_output + + output_stride, + broadcast_size); + + pipe.InitBuffer(power_queue, BUFFER_NUM, + (sizeof(float_t)*count+32-1)/32*32); + pipe.InitBuffer(position_queue, BUFFER_NUM, + (sizeof(float_t)*position_size+32-1)/32*32); + pipe.InitBuffer(arange_queue, BUFFER_NUM, + (sizeof(float_t)*count+32-1)/32*32); + pipe.InitBuffer(sin_mul_mscale_queue, BUFFER_NUM, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(cos_mul_mscale_queue, BUFFER_NUM, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(broadcast_power_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(theta_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(sin_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(cos_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + } + + __aicore__ inline void copy_in() { + LocalTensor input_local = + position_queue.AllocTensor(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = position_size * sizeof(float_t); + DataCopyPadExtParams padParams; + DataCopyPad(input_local, position_gm, dataCopyParams, padParams); + + position_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out() { + LocalTensor sin_local = sin_mul_mscale_queue.DeQue(); + int32_t BLOCK_NUM = 32 / sizeof(float_t); + DataCopy(output_sin_gm, sin_local, (broadcast_size + BLOCK_NUM - 1) + / BLOCK_NUM * BLOCK_NUM); + + LocalTensor cos_local = cos_mul_mscale_queue.DeQue(); + DataCopy(output_cos_gm, cos_local, (broadcast_size + BLOCK_NUM - 1) + / BLOCK_NUM * BLOCK_NUM); + + sin_mul_mscale_queue.FreeTensor(sin_local); + cos_mul_mscale_queue.FreeTensor(cos_local); + } + + __aicore__ inline void calculate() { + + // arange + LocalTensor arange_local = arange_queue.AllocTensor(); + ArithProgression(arange_local, first_value, diff_value, count); + + // pow + LocalTensor power_local = power_queue.AllocTensor(); + Power(power_local, static_cast(theta_scale), + arange_local); + + LocalTensor power_brcast_local = + broadcast_power_buffer.Get(); + + //TODO: is_glm==true. 
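The repeated `(sizeof(float_t)*N + 32 - 1) / 32 * 32` expressions in the InitBuffer calls above round each buffer size up to a multiple of 32 bytes, the block granularity the on-chip copies work in. The same rounding as a named helper (illustrative only; the kernel keeps the inline form):

    #include <cstddef>

    static constexpr size_t align_up_32(size_t bytes) {
        return (bytes + 32 - 1) / 32 * 32;
    }
    // e.g. align_up_32(sizeof(float) * 100) == 416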
+ if (!is_glm && !is_neox) { + // for :dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + // dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; + // the value of 0,1 or 2,3, ..., should be same. + + // broadcast: e.g. arange [64, 1] -> [64, 2] + BroadCast(power_brcast_local, power_local, + broadcast_shape0, arange_shape); + // position: [1] + copy_in(); + LocalTensor position_local = + position_queue.DeQue(); + position_value = position_local.GetValue(0); + position_queue.FreeTensor(position_local); + } + else { + // for: dst_data[0] = x0*cos_theta - x1*sin_theta; + // dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + // the value of 0,n_dims/2 or 1,n/dims/2+1 should be same. + + // broadcast: e.g. arange [1, 64] -> [2, 64] + BroadCast(power_brcast_local, power_local, + broadcast_shape2, arange_shape1); + + // position * freq_scale + copy_in(); + LocalTensor position_local = + position_queue.DeQue(); + position_value = position_local.GetValue(0); + position_value = position_value * freq_scale; + position_queue.FreeTensor(position_local); + } + + // theta + LocalTensor theta_local = theta_buffer.Get(); + Muls(theta_local, power_brcast_local, position_value, + broadcast_size); + + // sin & cos + // TODO: if ext_factor != 0 + LocalTensor sin_local = sin_buffer.Get(); + Sin(sin_local, theta_local); + LocalTensor sin_mul_mscale_local = + sin_mul_mscale_queue.AllocTensor(); + Muls(sin_mul_mscale_local, sin_local, attn_factor, broadcast_size); + + LocalTensor cos_local = cos_buffer.Get(); + Cos(cos_local, theta_local); + LocalTensor cos_mul_mscale_local = + cos_mul_mscale_queue.AllocTensor(); + Muls(cos_mul_mscale_local, cos_local, attn_factor, broadcast_size); + + // release, VECCALC not need. + arange_queue.FreeTensor(arange_local); + power_queue.FreeTensor(power_local); + + // output + sin_mul_mscale_queue.EnQue(sin_mul_mscale_local); + cos_mul_mscale_queue.EnQue(cos_mul_mscale_local); + copy_out(); + } + + private: + + int64_t head_dim; + float_t first_value; + float_t diff_value; + int32_t count; + float_t theta_scale; + float_t attn_factor; + float_t freq_scale; + bool is_neox; + bool is_glm; + + uint32_t broadcast_shape0[2]; + uint32_t broadcast_shape1[2]; + uint32_t broadcast_shape2[2]; + uint32_t position_shape[2]; + uint32_t arange_shape[2]; + uint32_t arange_shape1[2]; + int64_t broadcast_size; + int64_t position_size; + int64_t position_stride; + int64_t output_stride; + float_t position_value; + + TPipe pipe; + GlobalTensor position_gm; + GlobalTensor output_sin_gm; + GlobalTensor output_cos_gm; + TQue arange_queue; + TQue power_queue; + TQue position_queue; + TQue sin_mul_mscale_queue; + TQue cos_mul_mscale_queue; + TBuf broadcast_power_buffer; + TBuf theta_buffer; + TBuf sin_buffer; + TBuf cos_buffer; + +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, int32_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_rope_init_cache( + GM_ADDR position_gm, + GM_ADDR output_sin_gm, + GM_ADDR output_cos_gm, + GM_ADDR param, + GM_ADDR input_ne_gm + ) { + // copy params from gm to ub. 
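The wrapper below copies the parameter struct and the ne arrays byte by byte out of global memory, so the literal `32` passed to copy_to_ub() is a byte count: four int64_t dims per array. A small host-side check that makes the intent explicit (illustrative; the kernels pass the literal directly):

    #include <cstdint>

    // Four 8-byte dims are copied for every ne/nb array.
    constexpr int32_t DIMS_BYTES = 4 * (int32_t) sizeof(int64_t);
    static_assert(DIMS_BYTES == 32, "kernels copy four 8-byte dims per array");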
+ rope_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(rope_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + int64_t input_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + + InitCache op; + op.init(position_gm, output_sin_gm, output_cos_gm, param_ub, input_ne_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f5502afbe98b3..78fb3c7d9cd84 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3289,7 +3289,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso } // check if t1 can be represented as a repeatition of t0 -static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return ggml_is_empty(t0) ? ggml_is_empty(t1) : @@ -13563,6 +13563,7 @@ static void ggml_compute_forward_soft_max( } } + // ggml_compute_forward_soft_max_back static void ggml_compute_forward_soft_max_back_f32( @@ -21855,6 +21856,14 @@ int ggml_cpu_has_rpc(void) { #endif } +int ggml_cpu_has_cann(void) { +#if defined(GGML_USE_CANN) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_gpublas(void) { return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); } diff --git a/src/llama.cpp b/src/llama.cpp index 3dc0f85351c50..2ace2c2c921b1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19,6 +19,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_CANN) +# include "ggml-cann.h" #endif #ifdef GGML_USE_BLAS @@ -2639,6 +2641,8 @@ static size_t llama_get_device_count(const llama_model & model) { count = ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) count = ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_CANN) + return ggml_backend_cann_get_device_count(); #endif #if defined(GGML_USE_RPC) count += model.rpc_servers.size(); @@ -2671,6 +2675,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_CANN) + buft = ggml_backend_cann_buffer_type(gpu); #endif if (buft == nullptr) { @@ -2731,6 +2737,11 @@ static size_t llama_get_device_memory(const llama_model & model, int device) { size_t free; ggml_backend_vk_get_device_memory(device, &free, &total); return free; +#elif defined(GGML_USE_CANN) + size_t total; + size_t free; + ggml_backend_cann_get_device_memory(device, &total, &free); + return free; #else return 1; #endif @@ -17059,6 +17070,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_CANN) + return GGML_CANN_MAX_DEVICES; #else return 1; #endif @@ -17091,6 +17104,10 @@ void llama_backend_init(void) { struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } + +#if defined(GGML_USE_CANN) + ggml_cann_backend_init(); +#endif } void llama_numa_init(enum ggml_numa_strategy numa) { @@ -17100,6 +17117,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) { } void llama_backend_free(void) { +#if defined(GGML_USE_CANN) + ggml_cann_backend_free(); +#endif + 
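Outside of llama.cpp, the backend API added by this patch can be driven directly. A minimal sketch of bringing the backend up and tearing it down, mirroring what llama_backend_init, llama_new_context_with_model and llama_backend_free do above (error handling reduced to a null check):

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cann.h"

    int main() {
        ggml_cann_backend_init();                               // process-wide init, as in llama_backend_init
        const int32_t n_dev = ggml_backend_cann_get_device_count();
        printf("CANN devices: %d\n", n_dev);
        if (n_dev > 0) {
            ggml_backend_t backend = ggml_backend_cann_init(0); // main GPU, device 0
            if (backend != nullptr) {
                printf("backend: %s\n", ggml_backend_name(backend));
                ggml_backend_free(backend);
            }
        }
        ggml_cann_backend_free();                               // as in llama_backend_free
        return 0;
    }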
ggml_quantize_free(); } @@ -17389,6 +17410,30 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_CANN) + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + // TODO: ggml_backend_cann is not support split tensor now, just leave code here. + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version. + for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cann_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } #endif #ifdef GGML_USE_BLAS diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cfa7073153486..f0b50ba1f5fd4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -123,6 +123,7 @@ llama_target_and_test(test-grammar-integration.cpp) llama_target_and_test(test-grad0.cpp) # llama_target_and_test(test-opt.cpp) # SLOW llama_target_and_test(test-backend-ops.cpp) +llama_target_and_test(test-backend-runtime.cpp) llama_target_and_test(test-rope.cpp) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f74c0db475e2e..2906e831815af 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -759,7 +759,7 @@ struct test_dup : public test_case { } test_dup(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}, + std::array ne = {10, 10, 20, 1}, std::array permute = {0, 0, 0, 0}) : type(type), ne(ne), permute(permute), _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} @@ -779,9 +779,12 @@ struct test_cpy : public test_case { const ggml_type type_src; const ggml_type type_dst; const std::array ne; + const std::array permute; + bool _src_use_permute; + bool _dst_use_permute; std::string vars() override { - return VARS_TO_STR3(type_src, type_dst, ne); + return VARS_TO_STR6(type_src, type_dst, ne, permute, _src_use_permute, _dst_use_permute); } double max_nmse_err() override { @@ -793,12 +796,27 @@ struct test_cpy : public test_case { } test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}) - : type_src(type_src), type_dst(type_dst), ne(ne) {} + std::array ne = {10, 10, 10, 1}, + std::array permute = {0, 0, 0, 0}, + bool _dst_use_permute = false) + : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute), + _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0), + _dst_use_permute(_dst_use_permute) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data()); + ggml_tensor * dst; + if (_src_use_permute) { + src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); + } + if (_src_use_permute && _dst_use_permute) { + dst = ggml_new_tensor(ctx, type_src, 4, ne.data()); + dst = 
ggml_permute(ctx, dst, permute[0], permute[1], permute[2], permute[3]); + } + else { + int64_t* dst_ne = src->ne; + dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne); + } ggml_tensor * out = ggml_cpy(ctx, src, dst); return out; } @@ -1174,6 +1192,7 @@ struct test_soft_max : public test_case { } }; + // GGML_OP_ROPE struct test_rope : public test_case { const ggml_type type; @@ -2109,12 +2128,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_dup(GGML_TYPE_F16)); test_cases.emplace_back(new test_dup(GGML_TYPE_I32)); test_cases.emplace_back(new test_dup(GGML_TYPE_I16)); + test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows + test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst no contiguous test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // test cpy by rows + } + } + + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { + for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // dst contiguous + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3}, true)); // dst no contiguous } } @@ -2246,7 +2277,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (int n = 0; n < 10; ++n) { int64_t ne0 = dist_ne0(rng); int64_t ne1 = dist_ne1(rng); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 
4.0f : 0.0f)); } exponent <<= 1; @@ -2265,7 +2296,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } } } - + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f)); diff --git a/tests/test-backend-runtime.cpp b/tests/test-backend-runtime.cpp new file mode 100644 index 0000000000000..c08f81500d56a --- /dev/null +++ b/tests/test-backend-runtime.cpp @@ -0,0 +1,185 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +struct test_case { + virtual const char* case_desc() = 0; + virtual bool eval(ggml_backend_t backend) = 0; + std::vector get_random_float(size_t size) { + std::vector random_data; + random_data.resize(size); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0, 128); + for (size_t i = 0; i < size; i++) { + random_data[i] = dis(gen); + } + return random_data; + } + + void init_context(size_t tensor_count) { + ggml_init_params params = { + /* .mem_size = */ ggml_tensor_overhead() * tensor_count, + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; + ctx = ggml_init(params); + } + + ggml_tensor* new_tensor(ggml_backend_t backend, int dims, int64_t* ne) { + ggml_tensor* tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, dims, ne); + ggml_backend_buffer_t buf = + ggml_backend_alloc_ctx_tensors(ctx, backend); + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", + ggml_backend_name(backend)); + ggml_free(ctx); + return nullptr; + } + return tensor; + } + + ggml_context* ctx; +}; + +struct test_tensor_get_set_cpy_async : public test_case { + virtual const char* case_desc() { return "test_tensor_get_set_cpy_async"; } + virtual bool eval(ggml_backend_t backend) { + // init context + init_context(2); + + // alloc tensor + int64_t ne[] = {10, 10}; + ggml_tensor* tensor1 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + ggml_tensor* tensor2 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + + // get random data + int64_t elements = ggml_nelements(tensor1); + std::vector random_data = get_random_float(elements); + std::vector verify_data; + verify_data.resize(elements); + + // upload and download data + ggml_backend_tensor_set_async(backend, tensor1, + (void*)random_data.data(), 0, + ggml_nbytes(tensor1)); + ggml_backend_tensor_copy_async(backend, backend, tensor1, tensor2); + ggml_backend_tensor_get_async(backend, tensor2, + (void*)verify_data.data(), 0, + ggml_nbytes(tensor2)); + ggml_backend_synchronize(backend); + + return (memcmp(random_data.data(), verify_data.data(), + sizeof(float) * elements) == 0); + } +}; + +struct test_tensor_get_set_cpy : public test_case { + virtual const char* case_desc() { return "test_tensor_get_set_cpy"; } + virtual bool eval(ggml_backend_t backend) { + // init context + init_context(2); + + // alloc tensor + int64_t ne[] = {10, 10}; + ggml_tensor* tensor1 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + ggml_tensor* tensor2 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + + // get random data + int64_t elements = ggml_nelements(tensor1); + std::vector random_data = get_random_float(elements); + std::vector verify_data; + verify_data.resize(elements); + + // upload and download data + 
ggml_backend_tensor_set(tensor1, (void*)random_data.data(), 0, + ggml_nbytes(tensor1)); + ggml_backend_tensor_copy(tensor1, tensor2); + ggml_backend_tensor_get(tensor2, (void*)verify_data.data(), 0, + ggml_nbytes(tensor2)); + + return (memcmp(random_data.data(), verify_data.data(), + sizeof(float) * elements) == 0); + } +}; + +static bool test_backend(ggml_backend_t backend) { + std::vector> test_cases; + test_cases.emplace_back(new test_tensor_get_set_cpy_async()); + test_cases.emplace_back(new test_tensor_get_set_cpy()); + + size_t n_ok = 0; + for (auto& test : test_cases) { + printf(" %s ", test->case_desc()); + if (test->eval(backend)) { + n_ok++; + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + printf(" %zu/%zu tests passed\n", n_ok, test_cases.size()); + + return n_ok == test_cases.size(); +} + +static void usage(char** argv) { printf("Usage: %s [-b backend]\n", argv[0]); } + +int main(int argc, char** argv) { + const char* backend = NULL; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-b") == 0) { + if (i + 1 < argc) { + backend = argv[++i]; + } else { + usage(argv); + return 1; + } + } else { + usage(argv); + return 1; + } + } + + printf("Testing %zu backends\n\n", ggml_backend_reg_get_count()); + + size_t n_ok = 0; + for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) { + if (backend != NULL && + strcmp(backend, ggml_backend_reg_get_name(i)) != 0) { + printf(" Skipping %s\n\n", ggml_backend_reg_get_name(i)); + n_ok++; + continue; + } + + ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL); + + GGML_ASSERT(backend != NULL); + printf(" Backend name: %s\n", ggml_backend_name(backend)); + + bool ok = test_backend(backend); + + printf(" Backend %s: ", ggml_backend_name(backend)); + if (ok) { + printf("\033[1;32mOK\033[0m\n"); + n_ok++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + + printf("\n"); + } + + return 0; +} \ No newline at end of file
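The permuted DUP/CPY cases added to test-backend-ops above are what exercise the new backend on non-contiguous tensors. Stripped of the test harness, each one builds a graph along these lines (a sketch using the public ggml API; running it needs a backend and a graph allocator, which test-backend-ops provides):

    #include "ggml.h"

    // One of the new permuted-copy graphs: a non-contiguous F32 view copied
    // into a contiguous F16 tensor of the same (permuted) shape.
    static ggml_tensor * build_permuted_cpy(ggml_context * ctx) {
        int64_t ne[4] = {256, 2, 3, 4};
        ggml_tensor * src = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
        src = ggml_permute(ctx, src, 0, 2, 1, 3);   // swap dims 1 and 2 -> non-contiguous view
        ggml_tensor * dst = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, src->ne);
        return ggml_cpy(ctx, src, dst);
    }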