diff --git a/common/common.cpp b/common/common.cpp index 57d03a5789edd..16b41b4d1948a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -60,6 +60,10 @@ #define GGML_USE_CUDA_SYCL_VULKAN #endif +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_CANN) +#define GGML_USE_CUDA_SYCL_CANN +#endif + #if defined(LLAMA_USE_CURL) #ifdef __linux__ #include diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d641a9f12b388..a6497b6e0bf82 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -23,6 +23,10 @@ #include "ggml-cuda.h" #include "ggml-sycl.h" +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -120,6 +124,17 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_CANN + uint32_t count = ggml_backend_cann_get_device_count(); + for (uint32_t i = 0; i < count; i++) { + char buf[128]; + ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } #endif // TODO: other backends return id; diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d6882eec31e1a..4571181879ae8 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -16,6 +16,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: CLIP using Metal backend\n", __func__); #endif +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + printf("%s: CLIP using CANN backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h new file mode 100644 index 0000000000000..a15d6020bbaac --- /dev/null +++ b/ggml/include/ggml-cann.h @@ -0,0 +1,46 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#define GGML_CANN_NAME "CANN" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_CANN_MAX_DEVICES 16 + +#define QK4_0 32 +typedef struct { + uint16_t d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; + + +#define QK8_0 32 +typedef struct { + uint16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); + +GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); + +// device buffer +GGML_API GGML_CALL ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device); + +GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); +GGML_API GGML_CALL void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size); +GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); +void ggml_cann_backend_init(void); +void ggml_cann_backend_free(void); +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d895c9acdb596..2e534f3f87110 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -747,6 +747,8 @@ extern "C" { GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + 
GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); @@ -2391,6 +2393,7 @@ extern "C" { GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); + GGML_API int ggml_cpu_has_cann (void); // // Internal types and functions exposed for tests and benchmarks diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index d0f4097d8cd0c..cd226835caa89 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -737,6 +737,74 @@ if (GGML_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() +if (LLAMA_CANN) + if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME}) + set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME}) + message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") + endif() + + if (CANN_INSTALL_DIR) + # Only Support Linux. + if (LLAMA_CANN) + if (NOT UNIX) + set(LLAMA_CANN OFF) + message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off LLAMA_CANN") + endif() + endif() + + # Supported platforms: x86-64, arm64 + if (LLAMA_CANN) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + set(LLAMA_CANN OFF) + message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off LLAMA_CANN") + endif() + endif() + + # Set header and libs + if(LLAMA_CANN) + set(CANN_INCLUDE_DIRS + ${CANN_INSTALL_DIR}/include + ${CANN_INSTALL_DIR}/include/aclnn + ${CANN_INSTALL_DIR}/acllib/include + ) + + # TODO: find libs + link_directories( + ${CANN_INSTALL_DIR}/lib64 + ) + + add_subdirectory(ggml-cann/kernels) + list(APPEND CANN_LIBRARIES + ascendcl + nnopbase + opapi + acl_op_compiler + ascendc_kernels + ) + + set(GGML_HEADERS_CANN "../include/ggml-cann.h") + file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp") + list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp") + + message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") + message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) + endif() + else() + set(LLAMA_CANN OFF) + message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. 
Turning off LLAMA_CANN") + endif() + + if(NOT LLAMA_CANN) + message(WARNING "CANN: LLAMA_CANN is turned OFF, see above for details.") + endif() +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -1151,6 +1219,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN} ) if (EMSCRIPTEN) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 13c71c310c446..fb72060a8ac74 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -445,6 +445,11 @@ GGML_CALL static void ggml_backend_registry_init(void) { extern GGML_CALL void ggml_backend_kompute_reg_devices(void); ggml_backend_kompute_reg_devices(); #endif + +#ifdef GGML_USE_CANN + extern GGML_CALL int ggml_backend_cann_reg_devices(void); + ggml_backend_cann_reg_devices(); +#endif } GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp new file mode 100644 index 0000000000000..be9bfa51dccb2 --- /dev/null +++ b/ggml/src/ggml-cann.cpp @@ -0,0 +1,1128 @@ +#include "ggml-cann.h" + +#include + +#include +#include +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-cann/acl_ops.h" +#include "ggml-cann/aclnn_ops.h" +#include "ggml-cann/common.h" + +[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, + const char* file, int line, const char* msg) { + int32_t id = -1; + aclrtGetDevice(&id); + + fprintf(stderr, "CANN error: %s\n", msg); + fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, + file, line); + fprintf(stderr, " %s\n", stmt); + // abort with GGML_ASSERT to get a stack trace + GGML_ASSERT(!"CANN error"); +} + +void ggml_cann_set_device(const int32_t device) { + // TODO: uncomment these lines after empty context has fixed. + // int current_device; + // ACL_CHECK(aclrtGetDevice(¤t_device)); + + // if (device == current_device) { + // return; + // } + ACL_CHECK(aclrtSetDevice(device)); +} + +int32_t ggml_cann_get_device() { + int32_t id; + ACL_CHECK(aclrtGetDevice(&id)); + return id; +} + +static ggml_cann_device_info ggml_cann_init() { + ggml_cann_device_info info = {}; + + aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count); + + if (err != ACL_SUCCESS) { + fprintf(stderr, "%s: failed to initialize " GGML_CANN_NAME ": %s\n", + __func__, aclGetRecentErrMsg()); + return info; + } + + GGML_ASSERT(info.device_count <= GGML_CANN_MAX_DEVICES); + + // TODO: add more device info later. 
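// NOTE (illustrative sketch, not part of this patch): the public API declared in
// ggml-cann.h above is enough for an application to enumerate Ascend devices
// before calling ggml_backend_cann_init(). A minimal, hedged example; the helper
// name list_cann_devices is ours and error handling is omitted.
#include <stdio.h>
#include "ggml-cann.h"

static void list_cann_devices(void) {
    int32_t n_devices = ggml_backend_cann_get_device_count();
    for (int32_t i = 0; i < n_devices; i++) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cann_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cann_get_device_memory(i, &free_mem, &total_mem);
        // print the SoC name and free/total device memory in MiB for each visible NPU
        printf("CANN%d: %s, %zu/%zu MiB free\n", i, desc,
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }
}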
+ return info; +} + +const ggml_cann_device_info& ggml_cann_info() { + static ggml_cann_device_info info = ggml_cann_init(); + return info; +} + +// cann buffer +struct ggml_backend_cann_buffer_context { + int32_t device; + void* dev_ptr = nullptr; + std::string name; + std::vector dev_extra_ptrs; + + ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) + : device(device), + dev_ptr(dev_ptr), + name(GGML_CANN_NAME + std::to_string(device)) {} + + void* get_extra_ptr(size_t size) { + void* buffer; + ACL_CHECK(aclrtMalloc(&buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); + dev_extra_ptrs.push_back(buffer); + return buffer; + } + + ~ggml_backend_cann_buffer_context() { + ACL_CHECK(aclrtFree(dev_ptr)); + for (auto dev_extra_ptr : dev_extra_ptrs) { + ACL_CHECK(aclrtFree(dev_extra_ptr)); + } + } +}; + +GGML_CALL static const char* ggml_backend_cann_buffer_get_name( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + return ctx->name.c_str(); +} + +GGML_CALL static bool ggml_backend_buffer_is_cann( + ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cann_buffer_get_name; +} + +GGML_CALL static void ggml_backend_cann_buffer_free_buffer( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + delete ctx; +} + +GGML_CALL static void* ggml_backend_cann_buffer_get_base( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + return ctx->dev_ptr; +} + +GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, + const void* src, + void* dst) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q4_0* group = (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0)); + *scale_offset = group->d; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] & 0x0F); + (*quant_offset) |= ((group->qs[j + 1] << 4)); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] >> 4); + (*quant_offset) |= (group->qs[j + 1] & 0xF0); + quant_offset++; + } + } + + // put (uint4b_t -8) into int4b_t + for (quant_offset = (uint8_t*)dst; + quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } +} + +GGML_CALL static void ggml_backend_cann_transform_back_q4_0( + const ggml_tensor* tensor, void* src, void* dst) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)src; + uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes); + + for (;quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } + quant_offset = (uint8_t*)src; + + for (int i = 0; i < groups; i++) { + block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0)); + group->d = *scale_offset; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] = ((*quant_offset) & 0x0F); + group->qs[j + 1] = ((*quant_offset) >> 
4); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] |= ((*quant_offset) << 4); + group->qs[j + 1] |= ((*quant_offset) & 0xF0); + quant_offset++; + } + } +} + +GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, + const void* src, + void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q8_0* group = (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0)); + *scale_offset = group->d; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(quant_offset, group->qs, group_quant_size); + quant_offset += group_quant_size; + } +} + +GGML_CALL static void ggml_backend_cann_transform_back_q8_0( + const ggml_tensor* tensor, const void* src, void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + const uint8_t* quant_offset = (const uint8_t*)src; + const uint16_t* scale_offset = (const uint16_t*)((const char*)src + quant_bytes); + + for (int i = 0; i < groups; i++) { + block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0)); + group->d = *scale_offset; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(group->qs, quant_offset, group_quant_size); + quant_offset += group_quant_size; + } +} + +GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, + const void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +GGML_CALL static void ggml_backend_cann_transform_back( + const ggml_tensor* tensor, void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_back_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_back_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +GGML_CALL static bool need_transform(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } +} + +static void set_tensor_extra(ggml_backend_buffer_t buffer, + ggml_tensor* tensor) { + // if tensor is need transform, make sure all meta data are copied to + // npu. + // TODO: All tensors should copy meta data to npu, but extra is used to + // record memory usage. Only used for perf test. + size_t tensor_meta_size = sizeof(ggml_tensor); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + tensor->extra = ctx->get_extra_ptr(tensor_meta_size); + ACL_CHECK(aclrtMemcpy(tensor->extra, tensor_meta_size, tensor, + tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE)); +} + +GGML_CALL static void ggml_backend_cann_buffer_init_tensor( + ggml_backend_buffer_t buffer, ggml_tensor* tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + set_tensor_extra(buffer, tensor); + return; + } + + // TODO: can backend doesn't support quantized yet. Just leave the code + // here. 
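// NOTE (illustrative sketch, not part of this patch): ggml_backend_cann_transform_q8_0()
// above repacks host block_q8_0 data into the flat device layout the CANN kernels
// expect: all int8 quants first, then one fp16 scale per group of QK8_0 values.
// A host-only repack that mirrors that layout, reusing block_q8_0/QK8_0 from
// ggml-cann.h; the function name is ours.
#include <string.h>
#include "ggml-cann.h"

static void repack_q8_0_for_device(const block_q8_0 * src, int64_t n_groups, void * dst) {
    int8_t   * qs_out = (int8_t *) dst;                           // quant region
    uint16_t * d_out  = (uint16_t *) (qs_out + n_groups * QK8_0); // scale region follows
    for (int64_t i = 0; i < n_groups; i++) {
        memcpy(qs_out + i * QK8_0, src[i].qs, QK8_0);
        d_out[i] = src[i].d;
    }
}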
+ if (ggml_is_quantized(tensor->type)) { + // Initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t padded_size = + ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + size_t memset_size = padded_size - original_size; + ACL_CHECK(aclrtMemset((char*)tensor->data + original_size, + memset_size, 0, memset_size)); + } + } + set_tensor_extra(buffer, tensor); +} + +// TODO: need handle tensor which pas paddings. +GGML_CALL static void ggml_backend_cann_buffer_set_tensor( + ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data, + size_t offset, size_t size) { + GGML_ASSERT(size == ggml_nbytes(tensor)); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + // TODO: refer to cann(#6017), it use thread's default stream. + // For acl, synchronous functions use this default stream. + // Why aclrtSynchronizeDevice? + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset, size, + ACL_MEMCPY_HOST_TO_DEVICE)); + } else { + void* transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, (const char*)data + offset, + transform_buffer); + +#ifndef NDEBUG + void* check_buffer = malloc(size); + ggml_backend_cann_transform_back(tensor, transform_buffer, + check_buffer); + GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) == 0); + free(check_buffer); +#endif + ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE)); + free(transform_buffer); + } +} + +GGML_CALL static void ggml_backend_cann_buffer_get_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, + size_t offset, size_t size) { + GGML_ASSERT(size == ggml_nbytes(tensor)); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + } else { + void* transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + ggml_backend_cann_transform_back(tensor, transform_buffer, + (char*)data + offset); + free(transform_buffer); + } +} + +GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) { + if (ggml_backend_buffer_is_cann(src->buffer)) { + ggml_backend_cann_buffer_context* src_ctx = + (ggml_backend_cann_buffer_context*)src->buffer->context; + ggml_backend_cann_buffer_context* dst_ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + size_t memcpy_size = ggml_nbytes(src); + // Same device. + if (src_ctx->device == dst_ctx->device) { + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } else { + // Different device but can access by peer. 
+ int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, + dst_ctx->device)); + if (canAccessPeer) { + ggml_cann_set_device(src_ctx->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0)); + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } + } + } + return false; +} + +GGML_CALL static void ggml_backend_cann_buffer_clear( + ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); +} + +static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { + /* .get_name = */ ggml_backend_cann_buffer_get_name, + /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, + /* .get_base = */ ggml_backend_cann_buffer_get_base, + /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cann_buffer_clear, + /* .reset = */ NULL, +}; + +// cann buffer type +struct ggml_backend_cann_buffer_type_context { + int32_t device; + std::string name; +}; + +GGML_CALL static const char* ggml_backend_cann_buffer_type_name( + ggml_backend_buffer_type_t buft) { + ggml_backend_cann_buffer_type_context* ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + return ctx->name.c_str(); +} + +GGML_CALL static ggml_backend_buffer_t +ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + ggml_cann_set_device(buft_ctx->device); + + size = std::max(size, (size_t)1); + + void* dev_ptr; + aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + if (err != ACL_SUCCESS) { + fprintf( + stderr, + "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", + __func__, size / 1024.0 / 1024.0, buft_ctx->device, + aclGetRecentErrMsg()); + return nullptr; + } + + ggml_backend_cann_buffer_context* ctx = + new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); + + return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, + ctx, size); +} + +GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment( + ggml_backend_buffer_type_t buft) { + return 128; + + GGML_UNUSED(buft); +} + +GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size( + ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { + size_t size = ggml_nbytes(tensor); + int64_t ne0 = tensor->ne[0]; + + // last line must bigger than 32, because every single op deal at + // least 32 bytes. + // TODO: quantized type? + // int64_t line_size = ne0 * ggml_element_size(tensor); + // int64_t line_size_align_32 = (line_size + 31) & ~31; + // size += (line_size_align_32 - line_size); + + // TODO: not support quantized yet. + // TODO: consider un-continue tensor. 
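// NOTE (annotation, not part of this patch): the padding added just below mirrors what
// the CUDA backend does in its get_alloc_size. Assuming MATRIX_ROW_PADDING is 512, as
// elsewhere in ggml: a Q8_0 tensor with ne0 = 2112 gives 2112 % 512 = 64, so
// 512 - 64 = 448 extra elements are reserved, i.e. ggml_row_size(GGML_TYPE_Q8_0, 448)
// = 448 / 32 * 34 = 476 extra bytes, so kernels that read whole padded rows never run
// past the end of the allocation.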
+ if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size( + tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { + /* .get_name = */ ggml_backend_cann_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +GGML_CALL ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (device >= ggml_backend_cann_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type + ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES]; + + static bool ggml_backend_cann_buffer_type_initialized = false; + + if (!ggml_backend_cann_buffer_type_initialized) { + for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { + ggml_backend_cann_buffer_types[i] = { + /* .iface = */ ggml_backend_cann_buffer_type_interface, + /* .context = */ + new ggml_backend_cann_buffer_type_context{ + i, GGML_CANN_NAME + std::to_string(i)}, + }; + } + ggml_backend_cann_buffer_type_initialized = true; + } + + return &ggml_backend_cann_buffer_types[device]; +} + +static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, + struct ggml_tensor* dst) { + switch (dst->op) { + case GGML_OP_REPEAT: + ggml_cann_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggml_cann_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_ADD: + ggml_cann_add(ctx, dst); + break; + case GGML_OP_ACC: + ggml_cann_acc(ctx, dst); + break; + case GGML_OP_MUL: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_DIV: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_SILU: + ggml_cann_activation( + ctx, dst); + break; + // TODO: Use faster gelu?? 
+ case GGML_UNARY_OP_GELU_QUICK: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_TANH: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_RELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_HARDSIGMOID: + ggml_cann_activation(ctx, dst); + break; + case GGML_UNARY_OP_HARDSWISH: + ggml_cann_activation(ctx, dst); + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggml_cann_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggml_cann_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggml_cann_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggml_cann_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggml_cann_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggml_cann_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_cann_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggml_cann_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggml_cann_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggml_cann_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggml_cann_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggml_cann_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggml_cann_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggml_cann_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + // Do nothing with these ops. + break; + case GGML_OP_DIAG_MASK_INF: + ggml_cann_diag_mask(ctx, dst, -INFINITY); + break; + case GGML_OP_SOFT_MAX: + ggml_cann_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggml_cann_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggml_cann_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggml_cann_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggml_cann_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggml_cann_argsort(ctx, dst); + break; + default: + return false; + } + + return true; +} + +// backend +GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + return cann_ctx->name.c_str(); +} + +GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ACL_CHECK(aclrtSynchronizeDevice()); + cann_ctx->free_device_buffers(); + ACL_CHECK(aclrtResetDevice(cann_ctx->device)); + delete cann_ctx; + delete backend; +} + +GGML_CALL static ggml_backend_buffer_type_t +ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + return ggml_backend_cann_buffer_type(cann_ctx->device); +} + +GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, + ggml_tensor* tensor, + const void* data, + size_t offset, + size_t size) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, (const char*)data + offset, + size, ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); + } else { + void* transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, (const char*)data + offset, + transform_buffer); + +#ifndef NDEBUG + void* check_buffer = malloc(size); + 
ggml_backend_cann_transform_back(tensor, transform_buffer, + check_buffer); + GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size)); + free(check_buffer); +#endif + ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + free(transform_buffer); + } +} + +GGML_CALL static void ggml_backend_cann_get_tensor_async( + ggml_backend_t backend, const ggml_tensor* tensor, void* data, + size_t offset, size_t size) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ggml_backend_buffer_t buf = + tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + + GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && + "unsupported buffer type"); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data, + size, ACL_MEMCPY_DEVICE_TO_HOST, + cann_ctx->stream())); + } else { + void* transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST, + cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + ggml_backend_cann_transform_back(tensor, transform_buffer, + (char*)data + offset); + free(transform_buffer); + } +} + +GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( + ggml_backend_t backend_src, ggml_backend_t backend_dst, + const ggml_tensor* src, ggml_tensor* dst) { + GGML_ASSERT(ggml_backend_is_cann(backend_src) || + ggml_backend_is_cann(backend_dst)); + + if (!ggml_backend_buffer_is_cann(src->buffer) || + !ggml_backend_buffer_is_cann(dst->buffer)) { + return false; + } + + ggml_backend_buffer_t buf_src = + src->view_src ? src->view_src->buffer : src->buffer; + ggml_backend_buffer_t buf_dst = + dst->view_src ? 
dst->view_src->buffer : dst->buffer; + + ggml_backend_cann_context* cann_ctx_src = + (ggml_backend_cann_context*)backend_src->context; + ggml_backend_cann_context* cann_ctx_dst = + (ggml_backend_cann_context*)backend_dst->context; + + size_t copy_size = ggml_nbytes(dst); + if (backend_src != backend_dst) { + ggml_backend_cann_buffer_context* buf_ctx_src = + (ggml_backend_cann_buffer_context*)buf_src->context; + ggml_backend_cann_buffer_context* buf_ctx_dst = + (ggml_backend_cann_buffer_context*)buf_dst->context; + + GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device); + GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device); + + int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, + cann_ctx_dst->device)); + if (!canAccessPeer) { + return false; + } + + ggml_cann_set_device(cann_ctx_src->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0)); + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_dst->stream())); + + // record event on src stream + if (!cann_ctx_src->copy_event) { + ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event)); + } + + ACL_CHECK( + aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream())); + + // wait on dst stream for the copy to complete + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), + cann_ctx_src->copy_event)); + } else { + // src and dst are on the same backend + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_dst->stream())); + } + + return true; +} + +GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + + // Free temp buffers binding to stream. + cann_ctx->free_stream_buffers(0); +} + +GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( + ggml_backend_t backend, ggml_cgraph* cgraph) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor* node = cgraph->nodes[i]; + + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || + node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + + // if tensor is reused, free temp buffers first. + cann_ctx->free_tensor_buffers(node); + bool ok = ggml_cann_compute_forward(*cann_ctx, node); + + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, + node->name, ggml_op_name(node->op)); + } + // if not synchronize, aclrtSynchronizeStream in + // ggml_backend_cann_synchronize() will raise error. 
+ ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + GGML_ASSERT(ok); + } + + return GGML_STATUS_SUCCESS; +} + +GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, + const ggml_tensor* op) { + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + case GGML_OP_MUL_MAT: { + switch (op->src[0]->type) { + // case GGML_TYPE_Q4_0: + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + case GGML_OP_MUL_MAT_ID: + return false; + // embedding + case GGML_OP_GET_ROWS: { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; + case GGML_OP_CPY: { + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + case GGML_OP_DUP: + case GGML_OP_REPEAT: + case GGML_OP_CONCAT: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cann_buffer_type_name; +} + +GGML_CALL static bool ggml_backend_cann_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cann(buft)) { + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *)backend->context; + ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *)buft->context; + return buft_ctx->device == cann_ctx->device; + } + + return false; +} + +GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend, + const ggml_tensor* op) { + const int min_batch_size = 32; + GGML_UNUSED(backend); + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; +} + +static ggml_backend_event_t ggml_backend_cann_event_new( + ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + aclrtEvent event; + ACL_CHECK(aclrtCreateEvent(&event)); + + return new ggml_backend_event{ + /* .backend = */ backend, + /* .context = */ event, + }; +} + +static void ggml_backend_cann_event_free(ggml_backend_event_t event) { + ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context)); + + delete event; +} + +static void ggml_backend_cann_event_record(ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)event->backend->context; + + ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, 
cann_ctx->stream())); +} + +static void ggml_backend_cann_event_wait(ggml_backend_t backend, + ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + if (ggml_backend_is_cann(event->backend)) { + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), + (aclrtEvent)event->context)); + } else { + GGML_ASSERT(false); + } +} + +static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) { + ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context)); +} + +static ggml_backend_i ggml_backend_cann_interface = { + /* .get_name = */ ggml_backend_cann_name, + /* .free = */ ggml_backend_cann_free, + /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async, + /* .synchronize = */ ggml_backend_cann_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_cann_graph_compute, + /* .supports_op = */ ggml_backend_cann_supports_op, + /* .supports_buft = */ ggml_backend_cann_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ ggml_backend_cann_event_new, + /* .event_free = */ ggml_backend_cann_event_free, + /* .event_record = */ ggml_backend_cann_event_record, + /* .event_wait = */ ggml_backend_cann_event_wait, + /* .event_synchronize = */ ggml_backend_cann_event_synchronize, +}; + +static ggml_guid_t ggml_backend_cann_guid() { + static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34, + 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64}; + return &guid; +} + +GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { + if (device < 0 || device >= ggml_backend_cann_get_device_count()) { + fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); + return nullptr; + } + + ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); + if (ctx == nullptr) { + fprintf(stderr, "%s: error: failed to allocate context\n", __func__); + return nullptr; + } + + ggml_backend_t cann_backend = + new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), + /* .interface = */ ggml_backend_cann_interface, + /* .context = */ ctx}; + + return cann_backend; +} + +GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) { + return backend != NULL && + ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); +} + +GGML_CALL int32_t ggml_backend_cann_get_device_count() { + return ggml_cann_info().device_count; +} + +GGML_CALL void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size) { + ggml_cann_set_device(device); + const char* soc_name = aclrtGetSocName(); + snprintf(description, description_size, "%s", soc_name); +} + +GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, + size_t* total) { + ggml_cann_set_device(device); + ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); +} + +// backend registry +GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params, + void* user_data) { + ggml_backend_t cann_backend = + ggml_backend_cann_init((int)(intptr_t)user_data); + return cann_backend; + + GGML_UNUSED(params); +} + +extern "C" GGML_CALL int ggml_backend_cann_reg_devices(); + +GGML_CALL int ggml_backend_cann_reg_devices() { + aclInit(nullptr); + uint32_t 
device_count = ggml_backend_cann_get_device_count(); + // initialization + for (uint32_t i = 0; i < device_count; i++) { + char name[128]; + snprintf(name, sizeof(name), "%s%d", GGML_CANN_NAME, i); + ggml_backend_register(name, ggml_backend_reg_cann_init, + ggml_backend_cann_buffer_type(i), + (void*)(intptr_t)i); + } + return device_count; +} + +void ggml_cann_backend_init(void) { ACL_CHECK(aclInit(nullptr)); } + +void ggml_cann_backend_free(void) { ACL_CHECK(aclFinalize()); } diff --git a/ggml/src/ggml-cann/acl_ops.cpp b/ggml/src/ggml-cann/acl_ops.cpp new file mode 100644 index 0000000000000..fac9ea1ae4c3d --- /dev/null +++ b/ggml/src/ggml-cann/acl_ops.cpp @@ -0,0 +1,80 @@ +#include "acl_ops.h" + +OpCaller::OpCaller() { attrs = aclopCreateAttr(); } + +OpCaller::~OpCaller() { + for (aclTensorDesc* desc : input_descs) { + aclDestroyTensorDesc(desc); + } + for (aclDataBuffer* buffer : input_buffers) { + aclDestroyDataBuffer(buffer); + } + for (aclTensorDesc* desc : output_descs) { + aclDestroyTensorDesc(desc); + } + for (aclDataBuffer* buffer : output_buffers) { + aclDestroyDataBuffer(buffer); + } + aclopDestroyAttr(attrs); +} + +OpCaller& OpCaller::name(std::string _op_name) { + op_name = _op_name; + return *this; +} + +OpCaller& OpCaller::input_no_contiguous(ggml_tensor* tensor, const char* name) { + aclDataType dtype = type_mapping(tensor->type); + // TODO + int64_t ne[] = {tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]}; + aclTensorDesc* tensor_desc = + aclCreateTensorDesc(dtype, GGML_MAX_DIMS, ne, ACL_FORMAT_ND); + aclSetTensorDescName(tensor_desc, name); + input_descs.push_back(tensor_desc); + aclDataBuffer* data_buffer = + aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor)); + input_buffers.push_back(data_buffer); + return *this; +} + +OpCaller& OpCaller::input(ggml_tensor* tensor, const char* name) { + GGML_ASSERT(ggml_is_contiguous(tensor)); + return input_no_contiguous(tensor, name); +} + +OpCaller& OpCaller::output(ggml_tensor* tensor, const char* name) { + aclDataType dtype = type_mapping(tensor->type); + aclTensorDesc* tensor_desc = + aclCreateTensorDesc(dtype, GGML_MAX_DIMS, tensor->ne, ACL_FORMAT_ND); + aclSetTensorDescName(tensor_desc, name); + output_descs.push_back(tensor_desc); + aclDataBuffer* data_buffer = + aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor)); + output_buffers.push_back(data_buffer); + return *this; +} + +OpCaller& OpCaller::attr(int64_t value, const char* name) { + ACL_CHECK(aclopSetAttrInt(attrs, name, value)); + return *this; +} + +OpCaller& OpCaller::attr(bool value, const char* name) { + ACL_CHECK(aclopSetAttrBool(attrs, name, value)); + return *this; +} + +OpCaller& OpCaller::attr(float value, const char* name) { + ACL_CHECK(aclopSetAttrFloat(attrs, name, value)); + return *this; +} + +OpCaller& OpCaller::run(aclrtStream stream) { + ACL_CHECK(aclSetCompileopt(ACL_OP_JIT_COMPILE, "disable")); + ACL_CHECK(aclopCompileAndExecute( + op_name.c_str(), input_descs.size(), input_descs.data(), + input_buffers.data(), output_buffers.size(), output_descs.data(), + output_buffers.data(), attrs, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr, + stream)); + return *this; +} diff --git a/ggml/src/ggml-cann/acl_ops.h b/ggml/src/ggml-cann/acl_ops.h new file mode 100644 index 0000000000000..740105c873afe --- /dev/null +++ b/ggml/src/ggml-cann/acl_ops.h @@ -0,0 +1,72 @@ +#ifndef CANN_ACL_OPS +#define CANN_ACL_OPS + +#include +#include + +#include +#include + +#include "acl_tensor.h" +#include "common.h" + +struct OpCaller { + std::string op_name; + 
std::vector input_descs; + std::vector input_buffers; + std::vector output_descs; + std::vector output_buffers; + aclopAttr* attrs; + std::vector ptrs; + + OpCaller(); + + virtual ~OpCaller(); + + OpCaller& name(std::string _op_name); + + OpCaller& input_no_contiguous(ggml_tensor* tensor, const char* name); + + OpCaller& input(ggml_tensor* tensor, const char* name); + + OpCaller& output(ggml_tensor* tensor, const char* name); + + OpCaller& attr(int64_t value, const char* name); + + OpCaller& attr(bool value, const char* name); + + OpCaller& attr(float value, const char* name); + + template + OpCaller& input(ggml_backend_cann_context& ctx, ggml_tensor *dst, T* values, + aclDataType dtype, size_t dims, int64_t* dim, + const char* name, aclrtStream stream = nullptr) { + size_t n_elem = 1; + for (size_t i = 0; i < dims; i++) { + n_elem *= dim[i]; + } + + size_t n_bytes = n_elem * sizeof(T); + void* device_ptr = ctx.alloc_buffer(dst, n_bytes); + if (stream == nullptr) { + ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes, + ACL_MEMCPY_HOST_TO_DEVICE)); + } else { + ACL_CHECK(aclrtMemcpyAsync(device_ptr, n_bytes, values, n_bytes, + ACL_MEMCPY_HOST_TO_DEVICE, stream)); + } + + aclTensorDesc* tensor_desc = + aclCreateTensorDesc(dtype, dims, dim, ACL_FORMAT_ND); + aclSetTensorDescName(tensor_desc, name); + input_descs.push_back(tensor_desc); + aclDataBuffer* data_buffer = aclCreateDataBuffer(device_ptr, n_bytes); + input_buffers.push_back(data_buffer); + + return *this; + } + + OpCaller& run(aclrtStream stream = nullptr); +}; + +#endif // CANN_ACL_OPS \ No newline at end of file diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp new file mode 100644 index 0000000000000..1b9cc5b351efc --- /dev/null +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -0,0 +1,204 @@ +#include "acl_tensor.h" + +#include +#include + +/** + * Mapping ggml_tensor type to acl_tensor type. + */ +aclDataType type_mapping(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return ACL_FLOAT; + case GGML_TYPE_F16: + return ACL_FLOAT16; + case GGML_TYPE_I8: + return ACL_INT8; + case GGML_TYPE_I16: + return ACL_INT16; + case GGML_TYPE_I32: + return ACL_INT32; + default: + return ACL_DT_UNDEFINED; + } + return ACL_DT_UNDEFINED; +} + +static bool nb3_is_valid(const ggml_tensor* tensor) { + // check tensor->nb[3] is contiguous by ne. + if (tensor->nb[3] == tensor->ne[0] * tensor->ne[1] * tensor->ne[2] + * ggml_element_size(tensor)) { + return true; + } + else { + return false; + } +} + +/** + * Transform ggml_tensor to acl_tensor. Note that ggml_tensor dimension order + * is reversed compared to acl_tensor. + * + * If bcast_ne and bcast_nb is nullptr, use ggml_tensor's ne and nb. + * otherwise, use bcast_ne bcast_nb, which means tensor dims should be + * changed to satisfy the broadcast. @sa: get_bcast_shape. + */ +aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne, + size_t* bcast_nb, int64_t bcast_dims, + aclFormat format, size_t offset) { + // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be + // added. + int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2]; + int64_t acl_storage_ne[GGML_MAX_DIMS * 2]; + if (bcast_ne == nullptr) { + for (int i = 0; i < GGML_MAX_DIMS; i++) { + acl_ne[i] = tensor->ne[i]; + // The step size of acl is in elements. 
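// NOTE (annotation, not part of this patch): a worked example of the nb -> stride
// conversion done below. A contiguous F32 tensor with ne = {8, 4, 2, 1} has byte
// strides nb = {4, 32, 128, 256}; dividing by the element size (4) gives element
// strides {1, 8, 32, 64}, and after the std::reverse() calls the ACL tensor is
// created with ne = {1, 2, 4, 8} and strides = {64, 32, 8, 1}, matching ACL's
// outermost-first dimension order.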
+ acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor); + acl_storage_ne[i] = acl_ne[i]; + } + if (!nb3_is_valid(tensor)) { + if (tensor->ne[GGML_MAX_DIMS-1] == 1) { + if (tensor->nb[2] == tensor->nb[0]*tensor->ne[0] && + tensor->nb[1] == tensor->nb[2]*tensor->ne[2]) { + // nb[3] not valid, tensor is not contiguous by permuted to + // (0,2,1,3), still use tensor->ne. + // @see https://github.com/ggerganov/llama.cpp/issues/7930. + for (int i = 0; i < GGML_MAX_DIMS; i++) { + acl_storage_ne[i] = acl_ne[i]; + } + } + else { + // nb[3] is valid but tensor is not contiguous. + // e.g. nb=(2,1024,121072,1048576), ne=(32,128,8,1) with + // fp16, 1024/2 not equal to 32. + // acl_storage_ne should be decided by tensor->nb. + for (int i = 0; i < GGML_MAX_DIMS-1; i++) { + acl_storage_ne[i] = std::max(static_cast(1), + acl_stride[i+1] / acl_stride[i]); + } + acl_storage_ne[GGML_MAX_DIMS-1] = + tensor->ne[GGML_MAX_DIMS-1]; + } + } + else { + // not impl + GGML_ASSERT(false); + } + } + } else { + // With bcast + for (int i = 0; i < bcast_dims; i++) { + acl_ne[i] = bcast_ne[i]; + acl_stride[i] = bcast_nb[i] / ggml_element_size(tensor); + acl_storage_ne[i] = acl_ne[i]; + } + } + + int64_t dims = (bcast_dims == 0 ? GGML_MAX_DIMS : bcast_dims); + std::reverse(acl_ne, acl_ne + dims); + std::reverse(acl_stride, acl_stride + dims); + std::reverse(acl_storage_ne, acl_storage_ne + dims); + + aclTensor* acl_tensor = aclCreateTensor( + acl_ne, dims, type_mapping(tensor->type), acl_stride, + offset / ggml_element_size(tensor), format, acl_storage_ne, dims, + tensor->data); + + return acl_tensor; +} + +aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, + size_t type_size, int64_t* ne, size_t* nb, + int64_t dims, aclFormat format, size_t offset) { + int64_t tmp_ne[GGML_MAX_DIMS * 2]; + int64_t tmp_stride[GGML_MAX_DIMS * 2]; + + memcpy(tmp_ne, ne, dims * sizeof(int64_t)); + for (int i = 0; i < dims; i++) { + tmp_stride[i] = nb[i] / type_size; + } + + std::reverse(tmp_ne, tmp_ne + dims); + std::reverse(tmp_stride, tmp_stride + dims); + + aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, + offset / type_size, format, tmp_ne, + dims, data_ptr); + + return acl_tensor; +} + +/** + * Add extra dims to satisfy acl kernel's broadcast rules (same as numpy). + * ggml_tensor dimension order is reversed compared to Python. + * bcast src1 with src0 though adding a extra dim. + * for example: + * src0 -> (32,10,10,10) + * src1 -> (16,10,10,10) + * bcast_ne_src0 -> (16,2,10,10,10) + * bcast_ne_src1 -> (16,1,10,10,10) + * + * if dim0 has padding. + * a -> (2, 2) padding = 2 + * a: [[1, 2, *, *] + * [2, 3, *, *]] + * nb = (8, 4, 2) + * + * if a should bcast with b -> (2, 4) + * b' -> (2, 2, 2) + * b : [[1, 2, 3, 4, *, *] + * [5, 6, 7, 8, *, *]] + * nb = (12, 6, 1) + * + * after bcast: + * a' -> (2, 1, 2) + * a': [[[1, 2], *, *] + * [[2, 3], *, *]] + * nb = (8, 4, 2, 1) + * + * b' : [[[1, 2], [3, 4], *, *] + * [[5, 6], [7, 8], *, *]] + * nb = (12, 6, 2, 1) + * + * because dim1 in a inserted dim, should add nb for dim1, + * and all other nb moves to next in order. 
+ */ +int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, + int64_t* bcast_ne_src0, int64_t* bcast_ne_src1, + size_t* bcast_nb_src0, size_t* bcast_nb_src1) { + GGML_ASSERT(ggml_can_repeat(src1, src0)); + int bcast_dim_cnt = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + int64_t nr = src0->ne[i] / src1->ne[i]; + bcast_ne_src0[bcast_dim_cnt] = src0->ne[i] / nr; + bcast_ne_src1[bcast_dim_cnt] = src1->ne[i]; + bcast_nb_src0[bcast_dim_cnt] = src0->nb[i]; + bcast_nb_src1[bcast_dim_cnt] = src1->nb[i]; + bcast_dim_cnt++; + if (nr != 1) { + // Need to add an extra dim. + bcast_ne_src0[bcast_dim_cnt] = nr; + bcast_ne_src1[bcast_dim_cnt] = 1; + bcast_nb_src0[bcast_dim_cnt] = bcast_nb_src0[bcast_dim_cnt - 1] * + bcast_ne_src0[bcast_dim_cnt - 1]; + bcast_nb_src1[bcast_dim_cnt] = bcast_nb_src1[bcast_dim_cnt - 1] * + bcast_ne_src1[bcast_dim_cnt - 1]; + bcast_dim_cnt++; + } + } + return bcast_dim_cnt; +} + +/** + * Check if shape are not same, and no dim equals 1. + * if any dim equals 1, acl kernel will do the broadcast. + */ +bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) { + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) { + return true; + } + } + return false; +} diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h new file mode 100644 index 0000000000000..9c24c726e7c24 --- /dev/null +++ b/ggml/src/ggml-cann/acl_tensor.h @@ -0,0 +1,38 @@ +#ifndef CANN_ACL_TENSOR_H +#define CANN_ACL_TENSOR_H + +#include + +#include "common.h" + +// Broadcast +aclDataType type_mapping(ggml_type type); + +aclTensor* create_acl_tensor(const ggml_tensor* tensor, + int64_t* bcast_ne = nullptr, + size_t* bcast_nb = nullptr, int64_t bcast_dims = 0, + aclFormat format = ACL_FORMAT_ND, size_t offset = 0); + +aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, + size_t type_size, int64_t* ne, size_t* nb, + int64_t dims, aclFormat format = ACL_FORMAT_ND, size_t offset = 0); + +bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1); + +int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, + int64_t* bcast_ne_src0, int64_t* bcast_ne_src1, + size_t* bcast_nb_src0, size_t* bcast_nb_src1); + +// Bcast macro to avoid duplicate code. 
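// NOTE (annotation, not part of this patch): a worked example for get_bcast_shape()
// above. With src0->ne = {32, 8, 1, 1} and src1->ne = {32, 1, 1, 1}, dim 1 needs a
// repeat factor of 8, so one extra dimension is inserted and the function returns
// bcast_dim_cnt = 5 with
//   bcast_ne_src0 = {32, 1, 8, 1, 1}
//   bcast_ne_src1 = {32, 1, 1, 1, 1}
// i.e. the repeated axis of src0 is split out while src1 keeps a size-1 axis there,
// letting the ACL kernels apply their numpy-style broadcast. The BCAST_SHAPE /
// BCAST_PARAM macros below only declare the scratch arrays and forward them;
// ggml_cann_add() in aclnn_ops.cpp uses them exactly this way.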
+#define BCAST_SHAPE(src0, src1) \ + int64_t bcast_ne_##src0[GGML_MAX_DIMS * 2]; \ + int64_t bcast_ne_##src1[GGML_MAX_DIMS * 2]; \ + size_t bcast_nb_##src0[GGML_MAX_DIMS * 2]; \ + size_t bcast_nb_##src1[GGML_MAX_DIMS * 2]; \ + int64_t bcast_dims = \ + get_bcast_shape(src0, src1, bcast_ne_##src0, bcast_ne_##src1, \ + bcast_nb_##src0, bcast_nb_##src1); + +#define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims + +#endif // CANN_ACL_TENSOR_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp new file mode 100644 index 0000000000000..51594633d5da4 --- /dev/null +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -0,0 +1,2516 @@ +#include "aclnn_ops.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "kernels/ascendc_kernels.h" + +static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* repeat_array, + ggml_tensor* bind_tensor) { + // repeat tensor along each dim with repeat_array + + aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnRepeat(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclDestroyIntArray(repeats)); +} + +void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(ggml_can_repeat(src, dst)); + + size_t nbytes = ggml_nbytes(dst); + aclrtStream main_stream = ctx.stream(); + // Set dst to a zero tensor. 
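// NOTE (illustrative sketch, not part of this patch): every aclnn-based op in this
// file follows the two-phase pattern shown in aclnn_repeat() above: query the
// workspace size, allocate it from the context's temporary pool, launch on the
// context stream, then destroy the ACL handles. Schematically, for a hypothetical
// operator "Foo" (aclnnFoo is a placeholder, not a real API):
//
//   uint64_t workspaceSize = 0;
//   aclOpExecutor* executor = nullptr;
//   void* workspaceAddr = nullptr;
//   ACL_CHECK(aclnnFooGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
//   if (workspaceSize > 0) {
//       workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);  // recycled once the stream syncs
//   }
//   ACL_CHECK(aclnnFoo(workspaceAddr, workspaceSize, executor, ctx.stream()));
//   ACL_CHECK(aclDestroyTensor(acl_src));
//   ACL_CHECK(aclDestroyTensor(acl_dst));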
+ ACL_CHECK(aclrtMemsetAsync(dst->data, nbytes, 0, nbytes, main_stream)); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], + dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]}; + + aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray, dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, + aclTensor* acl_src1, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + // add: dst = acl_src0 + alpha*acl_src1 + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(alpha)); +} + +void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + aclTensor* acl_src0; + aclTensor* acl_src1; + aclTensor* acl_dst; + + // Need bcast + if (!ggml_are_same_shape(src0, src1) && need_bcast(src0, src1)) { + BCAST_SHAPE(src0, src1) + acl_src0 = create_acl_tensor(src0, BCAST_PARAM(src0)); + acl_src1 = create_acl_tensor(src1, BCAST_PARAM(src1)); + acl_dst = create_acl_tensor(dst, BCAST_PARAM(src0)); + } else { + acl_src0 = create_acl_tensor(src0); + acl_src1 = create_acl_tensor(src1); + acl_dst = create_acl_tensor(dst); + } + + aclnn_add(ctx, acl_src0, acl_src1, acl_dst, dst); + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + aclScalar* acl_negative_slope = + aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnLeakyReluGetWorkspaceSize( + acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(acl_negative_slope)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_concat(ggml_backend_cann_context& ctx, aclTensorList* tensorList, + aclTensor* acl_dst, int64_t concat_dim, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + // dims in llama.cpp is reversed. 
+ ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream)); +} + +void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + aclTensor* acl_src0 = create_acl_tensor(src0); + aclTensor* acl_src1 = create_acl_tensor(src1); + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t concat_dim = 1; + aclTensor* tensors[] = {acl_src0, acl_src1}; + aclTensorList* tensorList = aclCreateTensorList(tensors, 2); + aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst); + + ACL_CHECK(aclDestroyTensorList(tensorList)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, + float start, float stop, float step, int64_t n_elements, + ggml_tensor* bind_tensor) { + // arange: [start, stop), out(i+1) = out(i) + step. + + int64_t steps = (int64_t)std::ceil((stop - start) / step); + GGML_ASSERT(n_elements == steps); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT); + aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT); + aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT); + + ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnArange(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(acl_start)); + ACL_CHECK(aclDestroyScalar(acl_end)); + ACL_CHECK(aclDestroyScalar(acl_step)); +} + +void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t n_elements = ggml_nelements(dst); + float start; + float stop; + float step; + memcpy(&start, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&stop, (float*)dst->op_params + 1, sizeof(float)); + memcpy(&step, (float*)dst->op_params + 2, sizeof(float)); + + aclnn_arange(ctx, acl_dst, start, stop, step, n_elements, dst); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + dst->src[1] = dst->src[0]; + ggml_cann_mul_div(ctx, dst); +} + +void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // y = max(min(x, max_value), min_value). 
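+ // GGML_OP_CLAMP packs {min, max} as two consecutive floats in op_params;
+ // they are handed to aclnnClamp as scalars below.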
+ + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float*)dst->op_params + 1, sizeof(float)); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT); + aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(acl_min)); + ACL_CHECK(aclDestroyScalar(acl_max)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // acl_dst = acl_src * scale. + + ggml_tensor* src = dst->src[0]; + + // scale factor + float v; + memcpy(&v, dst->op_params, sizeof(float)); + + aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT); + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(scale)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + void* buffer = ctx.alloc_buffer(dst, ggml_nelements(dst) * sizeof(int64_t)); + aclTensor* tmp_tensor = + create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, + dst->nb, GGML_MAX_DIMS); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnArgsortGetWorkspaceSize( + acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnArgsort(workspaceAddr, workspaceSize, executor, main_stream)); + + workspaceSize = 0; + ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type), + acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(tmp_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // layer_norm for one layer. 
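+ // GGML_OP_NORM normalizes over the innermost dimension (ne[0]) and has no
+ // affine parameters, so normalized_shape is {ne[0]} and the gamma/beta
+ // arguments of aclnnLayerNorm are passed as nullptr.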
+ + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + std::vector normData = {dst->ne[0]}; + aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size()); + ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, + eps, acl_dst, nullptr, nullptr, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnLayerNorm(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyIntArray(norm)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + const float eps = 1e-6f; // TODO: make this a parameter + int n_groups = dst->op_params[0]; + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + int64_t N = src->ne[3]; + int64_t C = src->ne[2]; + int64_t HxW = src->ne[1] * src->ne[0]; + + size_t type_size = ggml_type_size(src->type); + int64_t ne[] = {n_groups, N}; + size_t nb[] = {type_size, type_size * n_groups}; + size_t n_bytes = N * n_groups; + void* buffer = ctx.alloc_buffer(dst, n_bytes * 2); + aclTensor* acl_mean_out = + create_acl_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); + aclTensor* acl_rstd_out = create_acl_tensor( + (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); + + ACL_CHECK(aclnnGroupNormGetWorkspaceSize( + acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst, + acl_mean_out, acl_rstd_out, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnGroupNorm(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_mean_out)); + ACL_CHECK(aclDestroyTensor(acl_rstd_out)); +} + +void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // if inplace: dst = dst + alpha * src1 + // else: dst = src0 + alpha * src1 + + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + size_t nb1 = ((int32_t*)dst->op_params)[0]; + size_t nb2 = ((int32_t*)dst->op_params)[1]; + size_t nb3 = ((int32_t*)dst->op_params)[2]; + size_t offset = ((int32_t*)dst->op_params)[3]; + bool inplace = (bool)((int32_t*)dst->op_params)[4]; + + size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3}; + + aclTensor* acl_dst = create_acl_tensor( + dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); + aclTensor* acl_src1 = create_acl_tensor(src1); + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclrtStream stream = ctx.stream(); + + if (!inplace) { + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, stream)); + aclTensor* acl_src0 = create_acl_tensor( + src0, src1->ne, src0->nb, 
GGML_MAX_DIMS, ACL_FORMAT_ND, offset); + ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclDestroyTensor(acl_src0)); + } else { + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, stream)); + } + + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // reducesum along last dim. + + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + + GGML_ASSERT(dst->ne[0] == 1); + aclTensor* acl_dst = create_acl_tensor(dst); + + int64_t reduce_dims_host[] = {3}; + aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnReduceSumGetWorkspaceSize(acl_src, reduce_dims, true, + type_mapping(src->type), acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnReduceSum(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = + create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor* acl_dst = + create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + + std::vector output_size{dst->ne[1], dst->ne[0]}; + auto output_size_array = aclCreateIntArray(output_size.data(), 2); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize( + acl_src, output_size_array, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK( + aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyIntArray(output_size_array)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst, + aclTensor* acl_src, aclTensor* acl_dst, int64_t* paddings, + float value = 0.0f) { + aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2); + aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize( + acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK( + aclnnConstantPadNd(workspaceAddr, workspaceSize, executor, stream)); + + ACL_CHECK(aclDestroyIntArray(acl_pad)); + ACL_CHECK(aclDestroyScalar(acl_value)); +} + +void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + aclTensor* acl_src = 
create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + // padding: value in the array means how much distance will be padding. + // the position of elements in the array means which dirction to padding, + // each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind, + // dim2.front, dim2.behind, dim3.front, dim3.behind] + int64_t paddings[] = { + 0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1], + 0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]}; + aclnn_pad(ctx, dst, acl_src, acl_dst, paddings); + + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_src)); +} + +void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + const int32_t* opts = (const int32_t*)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + switch (op) { + case GGML_OP_POOL_AVG: + ggml_cann_avg_pool2d(ctx, dst); + break; + case GGML_OP_POOL_MAX: + ggml_cann_max_pool2d(ctx, dst); + break; + case GGML_OP_POOL_COUNT: + GGML_ASSERT(false); + break; + } +} + +void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = + create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor* acl_dst = + create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + + // params + const int32_t* opts = (const int32_t*)dst->op_params; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + std::vector kernel_dims = {k1, k0}; + std::vector stride_dims = {s1, s0}; + std::vector padding_avg_dims = {p1, p0}; // (padH, padW) + + auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto* strides = aclCreateIntArray(stride_dims.data(), 2); + auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2); + + bool ceil_mode = false; // + bool count_include_pad = true; + int64_t divisor_override = 0; + int8_t cube_math_type = 0; + + // execute op api + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize( + acl_src, kernel_size, strides, paddings_avg, ceil_mode, + count_include_pad, divisor_override, cube_math_type, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK(aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, stream)); + + // release + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyIntArray(kernel_size)); + ACL_CHECK(aclDestroyIntArray(strides)); + ACL_CHECK(aclDestroyIntArray(paddings_avg)); +} + +void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = + create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor* acl_dst = + create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + // params + const int32_t* opts = (const int32_t*)dst->op_params; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], + src->ne[3]}; + size_t 
temp_nb[GGML_MAX_DIMS]; + + temp_nb[0] = ggml_element_size(src); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1]; + } + + void* buffer = + ctx.alloc_buffer(dst, ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); + aclTensor* tmp_tensor = + create_acl_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, + temp_nb, GGML_MAX_DIMS, ACL_FORMAT_NCHW); + + // pad: see padding in ggml_cann_pad() + int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0}; + float value = -FLT_MAX; + aclnn_pad(ctx, dst, acl_src, tmp_tensor, paddings, value); + + // max_pool + std::vector kernel_dims = {k1, k0}; + std::vector stride_dims = {s1, s0}; + // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end] + std::vector padding_max_dims = {0, 0, 0, 0}; + std::vector dilation_size = {1, 1}; + auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto* strides = aclCreateIntArray(stride_dims.data(), 2); + auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4); + auto* dilations = aclCreateIntArray(dilation_size.data(), 2); + + bool ceil_mode = false; + int64_t auto_pads = 0; + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnMaxPoolGetWorkspaceSize( + tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations, + ceil_mode, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnMaxPool(workspaceAddr, workspaceSize, executor, stream)); + + // release + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(tmp_tensor)); + ACL_CHECK(aclDestroyIntArray(kernel_size)); + ACL_CHECK(aclDestroyIntArray(strides)); + ACL_CHECK(aclDestroyIntArray(paddings_max)); + ACL_CHECK(aclDestroyIntArray(dilations)); +} + +static void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, + aclTensor* acl_src, aclTensor* acl_dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream)); +} + +void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + // param + dup_param param; + for (int i=0; i<4; i++) { + param.src_ne[i] = src->ne[i]; + param.src_nb[i] = src->nb[i]; + param.dst_ne[i] = dst->ne[i]; + param.dst_nb[i] = dst->nb[i]; + } + + // TODO: simplefify + if (src->type==GGML_TYPE_F16) { + if (dst->type==GGML_TYPE_Q8_0) { + aclrtlaunch_ascendc_quantize_f16_q8_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type==GGML_TYPE_F16) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + + // 
param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp16(rows_num, ctx.stream(), + src->data, dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + GGML_ASSERT(false); + } + if (dst->type==GGML_TYPE_F32) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(rows_num, + ctx.stream(), + src->data, + dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + GGML_ASSERT(false); + } + // TODO + GGML_ASSERT(false); + } + else if (src->type==GGML_TYPE_F32) { + //TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size + // && nb0 == type_size) + if (dst->type==GGML_TYPE_Q8_0) { + aclrtlaunch_ascendc_quantize_f32_q8_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type==GGML_TYPE_F32) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp32(rows_num, ctx.stream(), + src->data, dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + else { + //TODO: dst not contiguous + GGML_ASSERT(false); + } + } + if (dst->type==GGML_TYPE_F16) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(dup_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(dup_param), + ¶m, sizeof(dup_param), + ACL_MEMCPY_HOST_TO_DEVICE)); + aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(rows_num, + ctx.stream(), + src->data, + dst->data, + param_buffer); + return; + } + GGML_ASSERT(false); + } + } + // TODO + GGML_ASSERT(false); + } + else { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, dst, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + 
ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + GGML_ASSERT(false); + } +} + +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x, + const aclTensor* gamma, double epsilon, + const aclTensor* yOut, + const aclTensor* rstdOout, + uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream); +#ifdef __cplusplus +} +#endif + +static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst, + int64_t* ne, int64_t dims, aclDataType type, + size_t type_size) { + int64_t elements = 1; + for (int i = 0; i < dims; i++) { + elements *= ne[i]; + } + size_t n_bytes = elements * type_size; + + size_t nb[GGML_MAX_DIMS]; + nb[0] = type_size; + for (int i = 1; i < dims; i++) { + nb[i] = nb[i - 1] * ne[i - 1]; + } + + void* buffer = ctx.alloc_buffer(dst, n_bytes); + ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream())); + aclTensor* zero = create_acl_tensor(buffer, type, type_size, ne, nb, dims); + return zero; +} + +static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst, + int64_t* ne, int64_t dims, aclDataType type, + size_t type_size, float value = 1.0f) { + aclTensor* acl_tensor = aclnn_zero(ctx, dst, ne, dims, type, type_size); + float alpha_host = 1.0f; + aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); + aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK( + aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream())); + + return acl_tensor; +} + +void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_ASSERT(eps > 0.0f); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclTensor* acl_gamma = aclnn_ones( + ctx, dst, src->ne, 1, type_mapping(src->type), ggml_element_size(src)); + + int64_t rstd_ne[] = {1, src->ne[1], src->ne[2], src->ne[3]}; + aclTensor* acl_rstd = + aclnn_zero(ctx, dst, rstd_ne, GGML_MAX_DIMS, type_mapping(src->type), + ggml_element_size(src)); + + ACL_CHECK(aclnnRmsNormGetWorkspaceSize( + acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK( + aclnnRmsNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_gamma)); + ACL_CHECK(aclDestroyTensor(acl_rstd)); +} + +// TODO: performace is low. 
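+// The mask below is built from several full-tensor launches (fill + triu on
+// the mask, tril on src, then an in-place add); a single fused kernel would
+// likely be faster, hence the TODO above.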
+void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, + float value) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + const int n_past = ((int32_t*)dst->op_params)[0]; + + aclTensor* mask_tensor = + aclnn_ones(ctx, dst, src->ne, GGML_MAX_DIMS, type_mapping(src->type), + ggml_element_size(src), value); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK( + aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream())); + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(alpha)); + ACL_CHECK(aclDestroyTensor(mask_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, aclDataType cast_data_type, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream)); +} + +static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* new_dim, uint64_t dims, + ggml_tensor* bind_tensor) { + aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK( + aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyIntArray(acl_dims)); +} + +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self, + const aclIntArray* kernelSize, + const aclIntArray* dilation, + const aclIntArray* padding, + const aclIntArray* stride, + aclTensor* out, uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream); +#ifdef __cplusplus +} +#endif +void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // kernel + ggml_tensor* src1 = dst->src[1]; // input + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + 
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = is_2D ? ne13 : ne12; + const int64_t IC = is_2D ? ne12 : ne11; + + const int64_t KH = is_2D ? ne01 : 1; + const int64_t KW = ne00; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH] + aclTensor* acl_src1 = create_acl_tensor(src1); + int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N}; + size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; + + tmp_im2col_nb[0] = ggml_type_size(src1->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1]; + } + + // Calculate im2col. + // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * + // dst.elemcount. + void* tmp_im2col_buffer = + ctx.alloc_buffer(dst, ggml_nelements(dst) * ggml_element_size(src1)); + aclTensor* tmp_im2col_tensor = create_acl_tensor( + tmp_im2col_buffer, type_mapping(src1->type), ggml_type_size(src1->type), + tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + + std::vector kernel_dims = {KH, KW}; + std::vector dilation_size = {d1, d0}; + std::vector padding_dims = {p1, p0}; + std::vector stride_dims = {s1, s0}; + auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto* dilations = aclCreateIntArray(dilation_size.data(), 2); + auto* paddings = aclCreateIntArray(padding_dims.data(), 2); + auto* strides = aclCreateIntArray(stride_dims.data(), 2); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + aclrtStream stream = ctx.stream(); + + ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations, + paddings, strides, tmp_im2col_tensor, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream)); + + // Cast if dst is f16. 
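+ // The im2col result above was produced in src1's type (F32); when dst is
+ // F16 it is cast into a second temporary before the final permute.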
+ aclTensor* tmp_cast_tensor = nullptr; + if (src1->type != dst->type) { + void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst)); + size_t temp_cast_nb[GGML_MAX_DIMS - 1]; + temp_cast_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1]; + } + + tmp_cast_tensor = create_acl_tensor( + tmp_cast_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, + type_mapping(dst->type), dst); + } + + // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; + size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; + aclTensor* acl_dst = + create_acl_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3, dst); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3, dst); + } + + // release + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_cast_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyIntArray(kernel_size)); + ACL_CHECK(aclDestroyIntArray(dilations)); + ACL_CHECK(aclDestroyIntArray(paddings)); + ACL_CHECK(aclDestroyIntArray(strides)); +} + +static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK( + aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, + float scale, aclTensor* acl_dst, bool inplace, + ggml_tensor* bind_tensor) { + aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + if (inplace) { + ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } + else { + ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } + + + ACL_CHECK(aclDestroyScalar(acl_scale)); +} + +static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_other, ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK( + aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* 
acl_other, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + const ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + int half = dim / 2; + + aclTensor* acl_src = create_acl_tensor(src); + + // arange: [0, ..., half) + float start = 0; + float stop = half; + float step = 1; + int64_t n_elements_arange = half; + int64_t tmp_arange_ne[] = {half}; + size_t tmp_arange_nb[] = {sizeof(dst->type)}; + + void* tmp_arange_buffer = ctx.alloc_buffer(dst, half * sizeof(dst->type)); + aclTensor* tmp_arange_tensor = create_acl_tensor( + tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange, + dst); + + // freq + float freq_param = -logf(max_period) / half; + bool inplace = true; + aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace, dst); + aclnn_exp(ctx, tmp_arange_tensor, dst); + + // permute: src [0,1,2,3]->[0,1,3,2] + int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]}; + size_t tmp_permute_nb[GGML_MAX_DIMS]; + tmp_permute_nb[0] = ggml_type_size(src->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src)); + aclTensor* tmp_permute_tenosr = create_acl_tensor( + tmp_permute_buffer, type_mapping(src->type), ggml_type_size(src->type), + tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + int64_t permute_dim[] = {0, 1, 3, 2}; + int64_t num_dims = 4; + aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims, dst); + + // timestep * freq + int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2], + src->ne[3]}; + size_t tmp_mul_nb[GGML_MAX_DIMS]; + tmp_mul_nb[0] = ggml_type_size(src->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1]; + } 
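+ // The remaining steps compute args = timestep * freq, then concatenate
+ // cos(args) and sin(args) along ggml dim 0 (acl dim 3), which should match
+ // the cos-then-sin layout of the CPU timestep embedding.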
+ + int mul_nelements = + src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; + + void* tmp_mul_buffer = + ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + aclTensor* tmp_mul_tensor = create_acl_tensor( + tmp_mul_buffer, type_mapping(src->type), ggml_type_size(src->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_noinplcace_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, + tmp_mul_tensor, dst); + + // cos + void* tmp_cos_buffer = + ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + aclTensor* tmp_cos_tensor = create_acl_tensor( + tmp_cos_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor, dst); + + // sin + void* tmp_sin_buffer = + ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + aclTensor* tmp_sin_tensor = create_acl_tensor( + tmp_sin_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor, dst); + + // concat + int64_t concat_dim = 3; + aclTensor* acl_dst = create_acl_tensor(dst); + aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor}; + aclTensorList* tensorList = aclCreateTensorList(tensors, 2); + aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst); + + // release + // segmentation fault when delete both tensorList and his elements. + ACL_CHECK(aclDestroyTensorList(tensorList)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr)); + ACL_CHECK(aclDestroyTensor(tmp_mul_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + // fill acl_dst with scalar value. 
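+ // Uses aclnnInplaceFillScalar: acl_dst must already be backed by a device
+ // buffer, only its contents are overwritten.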
+ + auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize( + acl_dst, acl_scalar, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor, + ctx.stream())); + ACL_CHECK(aclDestroyScalar(acl_scalar)); +} + +static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, + aclTensor* acl_dst, aclTensor* acl_exp, + ggml_tensor* bind_tensor) { + // acl_dst = acl_dst^acl_exp + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize( + acl_dst, acl_exp, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize, + executor, ctx.stream())); +} + +static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_position, aclTensor* acl_dst, const int n_head, + int64_t* src_ne, const size_t src_nb0, float max_bias, + ggml_tensor* dst) { + GGML_UNUSED(src_ne[1]); + const int64_t ne2_ne3 = src_ne[2] * src_ne[3]; + GGML_ASSERT(src_nb0 == sizeof(float)); + GGML_ASSERT(n_head == src_ne[2]); + + const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head)); + + float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + // init arange + void* tmp_arange_buffer = + ctx.alloc_buffer(dst, ne2_ne3 * ggml_type_size(dst->type)); + size_t memset_size = ne2_ne3 * ggml_type_size(dst->type); + ACL_CHECK(aclrtMemset(tmp_arange_buffer, memset_size, 0, memset_size)); + + // arange1: [1, ..., n_heads_log2_floor+1) + float start = 1; + float stop = n_heads_log2_floor + 1; + float step = 1; + int64_t n_elements_arange = n_heads_log2_floor; + + int64_t tmp_arange1_ne[] = {n_heads_log2_floor}; + size_t tmp_arange1_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_arange1_tensor = create_acl_tensor( + tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_arange1_ne, tmp_arange1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange, + dst); + + aclTensor* tmp_arange2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1) + start = 1; + stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1; + step = 2; + n_elements_arange = ne2_ne3 - n_heads_log2_floor; + int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_arange2_nb[] = {sizeof(dst->type)}; + + aclTensor* tmp_arange2_tensor = create_acl_tensor( + (char*)tmp_arange_buffer + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), tmp_arange2_ne, + tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, + n_elements_arange, dst); + } + + // init mk_base + void* tmp_mk_base_buffer = + ctx.alloc_buffer(dst, ne2_ne3 * ggml_type_size(dst->type)); + int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; + size_t tmp_mk_base1_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base1_tensor = create_acl_tensor( + tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_base1_ne, tmp_mk_base1_nb, 
GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor, dst); + + aclTensor* tmp_mk_base2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_mk_base2_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base2_tensor = create_acl_tensor( + (char*)tmp_mk_base_buffer + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), tmp_mk_base2_ne, + tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor, dst); + } + + // init mk + int64_t tmp_mk_base_ne[] = {ne2_ne3}; + size_t tmp_mk_base_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base_tensor = create_acl_tensor( + tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclTensor* tmp_arange_tensor = create_acl_tensor( + tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor, dst); + + // reshape mk + int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; + size_t tmp_mk_nb[GGML_MAX_DIMS]; + tmp_mk_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1]; + } + aclTensor* tmp_mk_tensor = create_acl_tensor( + tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + // acl_position * mk + int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; + size_t tmp_output_nb[GGML_MAX_DIMS]; + tmp_output_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1]; + } + void* tmp_output_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst)); + aclTensor* tmp_output_tensor = create_acl_tensor( + tmp_output_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_noinplcace_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor, + dst); + + // add + aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst, dst); + + ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_output_tensor)); +} + +void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + const int n_head = ((int32_t*)dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t*)dst->op_params + 2, sizeof(float)); + + const int64_t ne0 = src->ne[0]; // all_seq_len = n_past + ne1 + const int64_t ne2 = src->ne[2]; // n_head -> this is k + const size_t nb0 = src->nb[0]; + + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(n_head == ne2); + + // position arange: [0, ..., ne0) + float start = 0; + float stop = ne0; + float step = 1; + int64_t n_elements_arange = ne0; + int64_t tmp_position_ne[] = {ne0, 1, 1, 1}; + size_t tmp_position_nb[] = {sizeof(dst->type)}; + + void* tmp_position_buffer = ctx.alloc_buffer(dst, ne0 * sizeof(dst->type)); + aclTensor* tmp_position_tensor = 
create_acl_tensor( + tmp_position_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + tmp_position_ne, tmp_position_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_position_tensor, start, stop, step, n_elements_arange, + dst); + + // call alibi + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + aclnn_alibi(ctx, acl_src, tmp_position_tensor, acl_dst, n_head, src->ne, nb0, max_bias, dst); + + ACL_CHECK(aclDestroyTensor(tmp_position_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_cann_dup(ctx, dst); +} + +static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* bind_tensor) { + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyScalar(alpha)); +} + +static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, + int64_t dim, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream)); +} + +void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; // mask + + aclTensor* acl_src0 = create_acl_tensor(src0); + aclTensor* acl_dst = create_acl_tensor(dst); + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float)); + + // input mul scale + aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); + + size_t n_bytes = ggml_nbytes(src0); + void* input_mul_scale_buffer = ctx.alloc_buffer(dst, n_bytes); + aclTensor* acl_input_mul_scale_tensor = create_acl_tensor( + input_mul_scale_buffer, + ACL_FLOAT, + ggml_type_size(src0->type), + src0->ne, src0->nb, + GGML_MAX_DIMS); + + bool inplace = false; + aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace, + dst); + + // mask + aclTensor* acl_src1_fp32_tensor = nullptr; + aclTensor* tmp_permute_tenosr = nullptr; + if (src1) { + const bool use_f16 = src1->type == GGML_TYPE_F16; + if (use_f16) { + // cast to fp32 + size_t n_bytes = ggml_nelements(src1) * sizeof(float_t); + size_t src1_fp32_nb[GGML_MAX_DIMS]; + src1_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1]; + } + void* src1_fp32_buffer = ctx.alloc_buffer(dst, n_bytes); + acl_src1_fp32_tensor = create_acl_tensor(src1_fp32_buffer, + ACL_FLOAT, + sizeof(float), + src1->ne, + src1_fp32_nb, + GGML_MAX_DIMS); + aclTensor* acl_src1 = 
create_acl_tensor(src1); + aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT, dst); + + ACL_CHECK(aclDestroyTensor(acl_src1)); + + } + else { + acl_src1_fp32_tensor = create_acl_tensor(src1); + } + + // broadcast the mask across rows + if (src1->ne[1] != src0->ne[1]) { + // mask shape: [1,1,a,b] + int64_t tmp_permute_ne[] = {src1->ne[0], src1->ne[2], src1->ne[1], src1->ne[3]}; + size_t tmp_permute_nb[GGML_MAX_DIMS]; + tmp_permute_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src1)); + tmp_permute_tenosr = create_acl_tensor( + tmp_permute_buffer, ACL_FLOAT, sizeof(float), + tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + int64_t permute_dim[] = {0, 2, 1, 3}; + int64_t num_dims = 4; + aclnn_permute(ctx, acl_src1_fp32_tensor, tmp_permute_tenosr, permute_dim, num_dims, dst); + } + + // alibi + const int n_head = src0->ne[2]; + const size_t src_nb0 = src0->nb[0]; + + n_bytes = ggml_nbytes(dst); + void* output_buffer = ctx.alloc_buffer(dst, n_bytes); + aclTensor* alibi_output_tensor = create_acl_tensor( + output_buffer, + ACL_FLOAT, + ggml_type_size(dst->type), + dst->ne, dst->nb, + GGML_MAX_DIMS); + if (max_bias <=0.0f) { + // slope = 1.0 + if (tmp_permute_tenosr) { + aclnn_add(ctx, tmp_permute_tenosr, acl_input_mul_scale_tensor, + alibi_output_tensor, dst); + } + else { + aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor, + alibi_output_tensor, dst); + } + + } + else { + // slope != 1.0 + if (tmp_permute_tenosr) { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_permute_tenosr, + alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, + dst); + } + else { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, acl_src1_fp32_tensor, + alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, + dst); + } + } + + // softmax + aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst, dst); + ACL_CHECK(aclDestroyTensor(alibi_output_tensor)); + } + else { + aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst, dst); + } + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyScalar(acl_scale)); + ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr)); + +} + +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F32: + aclrtlaunch_ascendc_get_row_f32( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src0->extra)->nb, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + case GGML_TYPE_F16: + aclrtlaunch_ascendc_get_row_f16( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src0->extra)->nb, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + case GGML_TYPE_Q4_0: + aclrtlaunch_ascendc_get_row_q4_0( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + 
break; + case GGML_TYPE_Q8_0: + aclrtlaunch_ascendc_get_row_q8_0( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + default: + GGML_ASSERT(false); + break; + } +} + +static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t dim, int64_t repeats, + int64_t output_size, ggml_tensor* bind_tensor) { + // each elem in acl_src will repeat. repeat number is `repeats`, repeats dim + // is `dim`. + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(acl_src, repeats, + dim, output_size, + acl_dst, + &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, executor, + main_stream)); + +} + +static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst, + ggml_tensor* bind_tensor) { + int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is fp32, + // atlas a2 will transpose it to HFLOAT32. + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(aclnnMatmul(workspaceAddr, workspaceSize, executor, + main_stream)); +} + +static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // weight + ggml_tensor* src1 = dst->src[1]; // input + + // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto broadcast, + // when weight ne2 or ne3 is not 1, weight need repeat. 
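+ // e.g. a weight of shape [K, N, 2, 1] multiplied by an input of shape
+ // [K, M, 6, 1] needs the weight repeated 3x along ggml dim 2 (done below
+ // with aclnnRepeatInterleaveIntWithDim) before the batched matmul.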
+ int repeat_dim2 = 1; + if (src0->ne[2] != src1->ne[2] && src0->ne[2] !=1) { + repeat_dim2 = src1->ne[2] / src0->ne[2]; + } + int repeat_dim3 = 1; + if (src0->ne[3] != src1->ne[3] && src0->ne[3] !=1) { + repeat_dim3 = src1->ne[3] / src0->ne[3]; + } + + int64_t weight_repeat_ne[] = {src0->ne[0], src0->ne[1], + src0->ne[2]*repeat_dim2, + src0->ne[3]*repeat_dim3}; + size_t weight_repeat_nb[GGML_MAX_DIMS]; + weight_repeat_nb[0] = src0->nb[0]; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + weight_repeat_nb[i] = weight_repeat_nb[i - 1] * weight_repeat_ne[i - 1]; + }; + + void* acl_repeat_weight_buffer = nullptr; + aclTensor* acl_repeat_weight_tensor = nullptr; + aclTensor* acl_weight_tensor =create_acl_tensor(src0); + if (repeat_dim2 > 1) { + weight_repeat_ne[3] = src0->ne[3]; + acl_repeat_weight_buffer = ctx.alloc_buffer(dst, + ggml_nelements(src0) + *repeat_dim2 + *ggml_type_size(src0->type)); + acl_repeat_weight_tensor = create_acl_tensor(acl_repeat_weight_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + weight_repeat_ne, + weight_repeat_nb, + GGML_MAX_DIMS); + + int64_t dim = 1; + int64_t output_size = src0->ne[2]*repeat_dim2; + aclnn_repeat_interleave(ctx, acl_weight_tensor, + acl_repeat_weight_tensor, dim, repeat_dim2, + output_size, dst); + } + if (repeat_dim3 > 1) { + weight_repeat_ne[3] = src0->ne[3]*repeat_dim3; + acl_repeat_weight_buffer = ctx.alloc_buffer(dst, + ggml_nelements(src0) + *repeat_dim2*repeat_dim3 + *ggml_type_size(src0->type)); + aclTensor* acl_repeat_weight_tensor2 = create_acl_tensor( + acl_repeat_weight_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + weight_repeat_ne, + weight_repeat_nb, + GGML_MAX_DIMS); + int64_t dim = 0; + int64_t output_size = src0->ne[3]*repeat_dim3; + if (acl_repeat_weight_tensor==nullptr) { + aclnn_repeat_interleave(ctx, acl_weight_tensor, + acl_repeat_weight_tensor2, dim, repeat_dim3, + output_size, dst); + } + else { + aclnn_repeat_interleave(ctx, acl_repeat_weight_tensor, + acl_repeat_weight_tensor2, dim, repeat_dim3, + output_size, dst); + } + ACL_CHECK(aclDestroyTensor(acl_repeat_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_repeat_weight_tensor2)); + } + if (acl_repeat_weight_buffer==nullptr) { + acl_repeat_weight_buffer = src0->data; + } + + // transpose weight: [1,2,3,4] -> [1,2,4,3] + int64_t weight_ne[] = {weight_repeat_ne[1], weight_repeat_ne[0], + weight_repeat_ne[2], weight_repeat_ne[3]}; + size_t weight_nb[] = {weight_repeat_nb[1], weight_repeat_nb[0], + weight_repeat_nb[2], weight_repeat_nb[3]}; + + aclTensor* acl_weight_transpose_tensor = create_acl_tensor( + acl_repeat_weight_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + weight_ne, + weight_nb, + GGML_MAX_DIMS); + + // mul_mat + aclTensor* acl_input_tensor = create_acl_tensor(src1); + aclTensor* acl_dst = create_acl_tensor(dst); + aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_transpose_tensor, acl_dst, + dst); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_weight_transpose_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // weight + ggml_tensor* src1 = dst->src[1]; // input + + // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC + // is regarded as batch. weight need transpose. 
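// Assumed device layout for Q8_0 weights (a sketch inferred from the offset
// math below, not an authoritative spec): all int8 quants are stored first,
// followed by one fp16 scale per group of QK8_0 = 32 values, so per 2D matrix
//   quant bytes : ne0 * ne1 * sizeof(int8_t)
//   scale bytes : ne0 / QK8_0 * ne1 * sizeof(uint16_t)
// and the scales of batch b start at
//   (char*)src0->data + total_quant_bytes + b * scale_stride
// which is exactly how weight_stride, scale_offset and scale_stride are
// derived next.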
+ int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; + size_t weight_elem_size = sizeof(uint8_t); + size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; + // size of one matrix is element_size * height * width. + size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1]; + size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; + + // scale stored at the end of weight. Also need transpose. + int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0}; + size_t scale_elem_size = sizeof(uint16_t); + size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, + scale_elem_size}; + size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0; + char* scale_offset = (char*)src0->data + weight_size; + + // input + void* input_buffer; + size_t input_elem_size = sizeof(uint16_t); + int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; + size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; + size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; + + if (src1->type != GGML_TYPE_F16) { + aclTensor* acl_src1_tensor = create_acl_tensor(src1); + input_buffer = + ctx.alloc_buffer(dst, ggml_nelements(src1) * input_elem_size); + + int64_t* input_cast_ne = src1->ne; + size_t input_cast_nb[GGML_MAX_DIMS]; + input_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; + } + + aclTensor* acl_input_tensor = + create_acl_tensor(input_buffer, ACL_FLOAT16, input_elem_size, + input_cast_ne, input_cast_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16, dst); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); + } else { + input_buffer = src1->data; + } + + // output + size_t output_elem_size = sizeof(uint16_t); + int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; + size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; + void* output_buffer = + ctx.alloc_buffer(dst, ggml_nelements(dst) * output_elem_size); + size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; + + // aclnn + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { + for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { + int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); + int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); + + int64_t batch1 = n1 * src1->ne[2] + c1; + int64_t batch0 = n0 * src0->ne[2] + c0; + + aclTensor* acl_input_tensor = create_acl_tensor( + (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, + input_elem_size, input_ne, input_nb, 2); + aclTensor* acl_weight_tensor = create_acl_tensor( + (char*)src0->data + batch0 * weight_stride, ACL_INT8, + weight_elem_size, weight_ne, weight_nb, 2); + aclTensor* acl_scale_tensor = create_acl_tensor( + scale_offset + batch0 * scale_stride, ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2); + aclTensor* acl_output_tensor = create_acl_tensor( + (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, + nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, + &workspaceSize, &executor)); + + if (workspaceSize > 0 && workspaceAddr == nullptr) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, 
ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + } + } + + // cast out + int64_t* output_cast_ne = dst->ne; + size_t output_cast_nb[GGML_MAX_DIMS]; + output_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; + } + + aclTensor* acl_output_tensor = + create_acl_tensor(output_buffer, ACL_FLOAT16, output_elem_size, + output_cast_ne, output_cast_nb, GGML_MAX_DIMS); + aclTensor* acl_dst_tensor = create_acl_tensor(dst); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT, dst); + + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); +} + +void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + const enum ggml_type type = dst->src[0]->type; + switch (type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + ggml_cann_mat_mul_fp(ctx, dst); + break; + // case GGML_TYPE_Q4_0: + // ggml_cann_mul_mat_q4_0(ctx, dst); + // break; + case GGML_TYPE_Q8_0: + ggml_cann_mul_mat_q8_0(ctx, dst); + break; + default: + GGML_ASSERT(false); + break; + } +} + +static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* shifts, int64_t* dims, + ggml_tensor* bind_tensor) { + + aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); + aclIntArray* acl_dims = aclCreateIntArray(dims, 1); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK( + aclnnRoll(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyIntArray(acl_shifts)); + ACL_CHECK(aclDestroyIntArray(acl_dims)); +} + +void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src1 = dst->src[1]; // position + + // param init + rope_param param; + for (int i=0; i<4; i++) { + param.input_ne[i] = src0->ne[i]; + param.position_ne[i] = src1->ne[i]; + } + + const int mode = ((int32_t *) dst->op_params)[2]; + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + param.is_neox = is_neox; + param.is_glm = is_glm; + + memcpy(¶m.freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(¶m.freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(¶m.ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(¶m.attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(¶m.beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(¶m.beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + param.n_dims = ((int32_t *) dst->op_params)[1]; + param.n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + const float theta_scale = powf(param.freq_base, -2.0f/param.n_dims); + param.theta_scale = theta_scale; + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(param.n_dims, param.n_orig_ctx, param.freq_base, + param.beta_fast, param.beta_slow, corr_dims); + param.corr_dims[0] = corr_dims[0]; + param.corr_dims[1] = corr_dims[1]; + + // param copy + void *param_buffer; + ACL_CHECK(aclrtMalloc(¶m_buffer, sizeof(rope_param), + ACL_MEM_MALLOC_HUGE_FIRST)); + + 
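// Worked equation for the parameters packed into rope_param above (standard
// RoPE, stated here for orientation): for position p and lane pair i the
// rotation angle is
//   theta(p, i) = p * freq_base^(-2*i/n_dims) = p * theta_scale^i,
//   theta_scale = freq_base^(-2/n_dims)
// so the init-cache kernel only needs theta_scale and can build the cos/sin
// tables iteratively (theta_{i+1} = theta_i * theta_scale), while corr_dims
// holds the YaRN correction range returned by ggml_rope_yarn_corr_dims.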
ACL_CHECK(aclrtMemcpy(param_buffer, sizeof(rope_param), ¶m, + sizeof(rope_param), ACL_MEMCPY_HOST_TO_DEVICE)); + + // cast position: i32 to fp32 + aclTensor* acl_position_tensor = create_acl_tensor(src1); + void* position_cast_buffer = ctx.alloc_buffer(dst, ggml_nelements(src1) + * sizeof(float_t)); + int64_t* position_cast_ne = src1->ne; + size_t position_cast_nb[GGML_MAX_DIMS]; + position_cast_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + position_cast_nb[i] = position_cast_nb[i - 1] * position_cast_ne[i - 1]; + } + aclTensor* acl_postion_cast_tensor = create_acl_tensor(position_cast_buffer, + ACL_FLOAT, + sizeof(float_t), + position_cast_ne, + position_cast_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_position_tensor, acl_postion_cast_tensor, ACL_FLOAT, + dst); + + // init cos/sin cache, + void* sin_buffer = ctx.alloc_buffer(dst, src0->ne[0] * src0->ne[2] + * sizeof(float_t)); + void* cos_buffer = ctx.alloc_buffer(dst, src0->ne[0] * src0->ne[2] + * sizeof(float_t)); + + + aclrtlaunch_ascendc_rope_init_cache(param.position_ne[0], ctx.stream(), + position_cast_buffer, + sin_buffer, cos_buffer, + param_buffer, + ((ggml_tensor*)src0->extra)->ne); + ACL_CHECK(aclrtFree(param_buffer)); + + // reshape sin&cos + // TODO: ne[3] != 0 + int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; + size_t sin_reshape_nb[GGML_MAX_DIMS]; + sin_reshape_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; + } + aclTensor* acl_sin_reshape_tensor = create_acl_tensor(sin_buffer, ACL_FLOAT, + sizeof(float_t), + sin_reshape_ne, + sin_reshape_nb, + GGML_MAX_DIMS); + aclTensor* acl_cos_reshape_tensor = create_acl_tensor(cos_buffer, ACL_FLOAT, + sizeof(float_t), + sin_reshape_ne, + sin_reshape_nb, + GGML_MAX_DIMS); + + // TODO: warp the following as aclrtlaunch_ascendc_rope_cal<<<>>> + // roll input + void* input_roll_buffer; + aclTensor* acl_minus_one_tensor; + if (is_glm) { + // TODO + GGML_ASSERT(false); + } + else if (!is_neox) { + // roll input: [q0,q1,q2,...] -> [q1,q0,q3,q2...] + input_roll_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src0)); + int64_t input_roll_ne[4] = {2, src0->ne[1]*(src0->ne[0]/2), src0->ne[2], + src0->ne[3]}; + size_t input_roll_nb[GGML_MAX_DIMS]; + input_roll_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; + } + aclTensor* acl_input_roll_tensor = create_acl_tensor( + input_roll_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + input_roll_ne, + input_roll_nb, + GGML_MAX_DIMS); + aclTensor* acl_input_tensor = create_acl_tensor( + src0->data, + type_mapping(src0->type), + ggml_type_size(src0->type), + input_roll_ne, + input_roll_nb, + GGML_MAX_DIMS); + + int64_t shifts[] = {1}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims, + dst); + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + + // init [-1, 1, -1, 1, ...] 
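// Why a roll plus an alternating sign vector reproduces the pairwise rotation
// (a sketch of the identity, under the assumption that the roll above swaps
// each (x[2i], x[2i+1]) pair):
//   out[2i]   = x[2i]   * cos - x[2i+1] * sin
//   out[2i+1] = x[2i+1] * cos + x[2i]   * sin
// With rolled[2i] = x[2i+1] and rolled[2i+1] = x[2i] this is
//   out = x * cos + rolled * sign * sin,   sign = [-1, 1, -1, 1, ...]
// which is what the multiplies further down implement; the sign vector is
// initialized next.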
+ void* minus_one_scale_buffer = ctx.alloc_buffer(dst, sizeof(int64_t) + * src0->ne[0]); + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(int64_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = create_acl_tensor(minus_one_scale_buffer, + ACL_INT64, sizeof(int64_t), + minus_one_ne, minus_one_nb, + GGML_MAX_DIMS); + int64_t* minus_one_scale = new int64_t[src0->ne[0]]; + for (int i=0; ine[0]; i+=2) { + minus_one_scale[i] = -1.0; + minus_one_scale[i+1] = 1.0; + } + + aclrtMemcpy(minus_one_scale_buffer, src0->ne[0] * sizeof(int64_t), + minus_one_scale, src0->ne[0] * sizeof(int64_t), + ACL_MEMCPY_HOST_TO_DEVICE); + delete[] minus_one_scale; + } + else { + // roll input: [q0,q1,q2,...] -> [q_half,q_half+1,..., q0,q1,...q_half-1] + input_roll_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src0)); + aclTensor* acl_input_roll_tensor = create_acl_tensor( + input_roll_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + src0->ne, src0->nb, + GGML_MAX_DIMS); + aclTensor* acl_input_tensor = create_acl_tensor(src0); + + int64_t shifts[] = {src0->ne[0] / 2}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims, + dst); + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + + // init [-1, -1, -1, 1, 1,1,...] + void* minus_one_scale_buffer = ctx.alloc_buffer(dst, sizeof(int64_t) + * src0->ne[0]); + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(int64_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = create_acl_tensor(minus_one_scale_buffer, + ACL_INT64, sizeof(int64_t), + minus_one_ne, minus_one_nb, + GGML_MAX_DIMS); + int64_t* minus_one_scale = new int64_t[src0->ne[0]]; + for (int i=0; ine[0]; i++) { + if (i < src0->ne[0]/2) { + minus_one_scale[i] = -1.0; + } + else { + minus_one_scale[i] = 1.0; + } + } + + aclrtMemcpy(minus_one_scale_buffer, src0->ne[0] * sizeof(int64_t), + minus_one_scale, src0->ne[0] * sizeof(int64_t), + ACL_MEMCPY_HOST_TO_DEVICE); + delete[] minus_one_scale; + } + + // input * scale + void* input_roll_mul_scale_buffer = ctx.alloc_buffer(dst, + ggml_nbytes(src0)); + size_t input_nb[GGML_MAX_DIMS]; + input_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* acl_input_roll_mul_scale_tensor = create_acl_tensor( + input_roll_mul_scale_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + src0->ne, input_nb, + GGML_MAX_DIMS); + aclTensor* acl_input_roll_reshape_tensor = create_acl_tensor( + input_roll_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + src0->ne, input_nb, + GGML_MAX_DIMS); + aclnn_noinplcace_mul(ctx, acl_input_roll_reshape_tensor, + acl_minus_one_tensor, acl_input_roll_mul_scale_tensor, + dst); + + // output + aclTensor* acl_src0 = create_acl_tensor(src0); + aclTensor* acl_dst = create_acl_tensor(dst); + void* output_fp32_buffer; + if (src0->type == GGML_TYPE_F32) { + aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor, dst); + aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor, dst); + aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst, dst); + // TODO: zeta scaling for xPos + // TODO: ne0 
!= n_dims in mode2 + } + else if (src0->type == GGML_TYPE_F16) { + size_t input_fp32_nb[GGML_MAX_DIMS]; + input_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; + } + void* input_fp32_buffer1 = ctx.alloc_buffer(dst, ggml_nelements(dst) + * sizeof(float_t)); + aclTensor* input_fp32_tensor1 = create_acl_tensor(input_fp32_buffer1, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, + GGML_MAX_DIMS); + void* input_fp32_buffer2 = ctx.alloc_buffer(dst, ggml_nelements(dst) + * sizeof(float_t)); + aclTensor* input_fp32_tensor2 = create_acl_tensor(input_fp32_buffer2, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, + GGML_MAX_DIMS); + + output_fp32_buffer = ctx.alloc_buffer(dst, ggml_nelements(dst) + * sizeof(float_t)); + aclTensor* output_fp32_tensor = create_acl_tensor(output_fp32_buffer, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, + GGML_MAX_DIMS); + aclnn_noinplcace_mul(ctx, acl_src0, acl_cos_reshape_tensor, + input_fp32_tensor1, dst); + aclnn_noinplcace_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor, input_fp32_tensor2, + dst); + aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, + output_fp32_tensor, dst); + aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16, dst); + + ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); + ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); + ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); + } + + ACL_CHECK(aclDestroyTensor(acl_position_tensor)); + ACL_CHECK(aclDestroyTensor(acl_postion_cast_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h new file mode 100644 index 0000000000000..56d975990e0f9 --- /dev/null +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -0,0 +1,182 @@ +#ifndef CANN_ACLNN_OPS +#define CANN_ACLNN_OPS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl_tensor.h" +#include "common.h" + +void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_pad(ggml_backend_cann_context& 
ctx, ggml_tensor* dst); + +void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value); + +void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, + ggml_tensor* dst); + +template +void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + aclTensor* acl_src0; + aclTensor* acl_src1; + aclTensor* acl_dst; + + // Need bcast + if (!ggml_are_same_shape(src0, src1) && need_bcast(src0, src1)) { + BCAST_SHAPE(src0, src1) + acl_src0 = create_acl_tensor(src0, BCAST_PARAM(src0)); + acl_src1 = create_acl_tensor(src1, BCAST_PARAM(src1)); + acl_dst = create_acl_tensor(dst, BCAST_PARAM(src0)); + } else { + acl_src0 = create_acl_tensor(src0); + acl_src1 = create_acl_tensor(src1); + acl_dst = create_acl_tensor(dst); + } + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +// Activation functions template. +template +void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +// Activation functions template for const aclTensors. 
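// Hypothetical instantiations of the two templates above (assumed usage; the
// template parameter list is expected to carry the getWorkspaceSize/execute
// functions that the bodies forward to, and aclnnMul/aclnnDiv/aclnnRelu are
// the corresponding CANN aclnn entry points):
//   ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);  // GGML_OP_MUL
//   ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);  // GGML_OP_DIV
//   ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(ctx, dst);
// The variant announced by the comment above, for aclnn ops that take
// const aclTensor* arguments, follows.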
+template +void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = create_acl_tensor(src); + aclTensor* acl_dst = create_acl_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +#endif // CANN_ACLNN_OPS \ No newline at end of file diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h new file mode 100644 index 0000000000000..892a2259b10ae --- /dev/null +++ b/ggml/src/ggml-cann/common.h @@ -0,0 +1,141 @@ +#ifndef CANN_COMMON_H +#define CANN_COMMON_H + +#include + +#include +#include +#include +#include +#include + +#include "../include/ggml-cann.h" +#include "../include/ggml.h" + +#define MATRIX_ROW_PADDING 512 +#define GGML_CANN_MAX_STREAMS 8 + +[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, + const char* file, int line, const char* msg); + +// Error handling macro +#define ACL_CHECK_GEN(stmt, success, error_fn) \ + do { \ + int err_code = (stmt); \ + if (err_code != (success)) { \ + ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \ + } \ + } while (0); + +#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg) + +struct ggml_cann_device_info { + int32_t device_count; + + // TODO: add more device info later. + // struct cann_device_info { + // int cc; // compute capability + // size_t smpb; // max. shared memory per block + // bool vmm; // virtual memory support + // size_t vmm_granularity; // granularity of virtual memory + // size_t total_vram; + // }; + + // cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; +}; + +const ggml_cann_device_info& ggml_cann_info(); + +void ggml_cann_set_device(int32_t device); +int32_t ggml_cann_get_device(); + +struct ggml_backend_cann_context { + int32_t device; + std::string name; + aclrtEvent copy_event = nullptr; + + aclrtStream streams[GGML_CANN_MAX_STREAMS] = {{nullptr}}; + + // bind temp buffers to stream. Free after sync. + std::multimap buffers[GGML_CANN_MAX_STREAMS]; + + explicit ggml_backend_cann_context(int device) + : device(device), name(GGML_CANN_NAME + std::to_string(device)) {} + + ~ggml_backend_cann_context() { + if (copy_event != nullptr) { + ACL_CHECK(aclrtDestroyEvent(copy_event)); + } + for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) { + if (streams[i] != nullptr) { + ACL_CHECK(aclrtDestroyStream(streams[i])); + // Buffers should have been freed. + GGML_ASSERT(buffers[i].size() == 0); + } + } + } + + void* alloc_buffer(ggml_tensor* dst, size_t size, int stream) { + void* buffer; + ACL_CHECK(aclrtMalloc(&buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); + bind_buffer(dst, buffer, stream); + return buffer; + } + + void* alloc_buffer(ggml_tensor* dst, size_t size) { + return alloc_buffer(dst, size, 0); + } + + // Free all buffers bind to all streams. + void free_device_buffers() { + for (int i = 0; i < GGML_CANN_MAX_STREAMS; i++) { + for (auto& it : buffers[i]) { + ACL_CHECK(aclrtFree(it.second)); + } + buffers[i].clear(); + } + } + + // Free all buffers bind to stream. 
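// Intended lifetime of the temporary buffers (an interpretation of the
// comments in this struct, not a guarantee): every alloc_buffer() call above
// both allocates device memory and binds it to (stream, dst tensor), so a
// typical op would do
//   void* ws = ctx.alloc_buffer(dst, workspaceSize); // bound to stream 0, dst
//   ... launch the aclnn op that uses ws ...
// and the memory is released either per tensor (free_tensor_buffers) or in
// bulk after a stream sync (free_stream_buffers below, free_device_buffers
// above). No buffer is reused across graph evaluations in this version.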
+ void free_stream_buffers(int stream) { + for (auto& it : buffers[stream]) { + ACL_CHECK(aclrtFree(it.second)); + } + buffers[stream].clear(); + } + + // Free all buffers belong to dst. + // Remove it from stream buffers to avoid double free. + void free_tensor_buffers(ggml_tensor* dst) { + // ggml_tensor.extra means which stream are tensor in. + for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) { + if (streams[i] != nullptr) { + for (auto pos = buffers[i].equal_range(dst); + pos.first != pos.second; ++pos.first) { + ACL_CHECK(aclrtFree(pos.first->second)); + } + buffers[i].erase(dst); + } + } + } + + aclrtStream stream(int stream) { + if (streams[stream] == nullptr) { + ggml_cann_set_device(device); + ACL_CHECK(aclrtCreateStream(&streams[stream])); + } + return streams[stream]; + } + + // All temp buffers should bind to stream and the dst tensor. + // It will be free if: + // 1. dst tensor are no longer used any more. + // 2. after stream sync. + void bind_buffer(ggml_tensor* dst, void* buf, int stream) { + buffers[stream].insert(std::make_pair(dst, buf)); + } + + aclrtStream stream() { return stream(0); } +}; + +#endif // CANN_COMMON_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt new file mode 100644 index 0000000000000..c0119a3ad7908 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt @@ -0,0 +1,33 @@ +if (NOT SOC_TYPE) + set (SOC_TYPE "Ascend910B3") +endif() + +file(GLOB SRC_FILES + get_row_f32.cpp + get_row_f16.cpp + get_row_q4_0.cpp + get_row_q8_0.cpp + quantize_f32_q8_0.cpp + quantize_f16_q8_0.cpp + rope_init_cache.cpp + dup.cpp +) + +string(TOLOWER ${SOC_TYPE} SOC_VERSION) +set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) +set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels STATIC + ${SRC_FILES} +) + +#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h new file mode 100644 index 0000000000000..6ea8a97e79575 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h @@ -0,0 +1,20 @@ +#ifndef ASCENDC_KERNELS_H +#define ASCENDC_KERNELS_H + +#include "aclrtlaunch_ascendc_get_row_f32.h" +#include "aclrtlaunch_ascendc_get_row_f16.h" +#include "aclrtlaunch_ascendc_get_row_q8_0.h" +#include "aclrtlaunch_ascendc_get_row_q4_0.h" + +#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" +#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" + +#include "aclrtlaunch_ascendc_rope_init_cache.h" +#include "rope.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" +#include "dup.h" + +#endif // ASCENDC_KERNELS_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp new file mode 100644 index 0000000000000..3da38d918bda7 
--- /dev/null +++ b/ggml/src/ggml-cann/kernels/dup.cpp @@ -0,0 +1,211 @@ +#include "kernel_operator.h" +#include "dup.h" + +#include + +using namespace AscendC; + +#define BUFFER_NUM 2 + +template +class DupByRows { + public: + __aicore__ inline DupByRows() {} + __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, dup_param& param) { + /* Dup by rows when src is contigous on first dimension and dst is + contiguous, each kernel process one row. + */ + + // Input has four dims. + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + // param + num_rows = param.src_ne[1] * param.src_ne[2] * param.src_ne[3]; + num_elem = param.src_ne[0]; + + // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) + idx_ne3 = op_block_idx / (param.src_ne[1] * param.src_ne[2]); + idx_ne2 = (op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2])) + / (param.src_ne[1]); + idx_ne1 = op_block_idx - idx_ne3 * (param.src_ne[1] * param.src_ne[2]) + - idx_ne2 * param.src_ne[1]; + + // src may not contiguous in dim [1,2,3], so stride decited by ne&nb + src_stride = param.src_nb[3] * idx_ne3 + param.src_nb[2] * idx_ne2 + + param.src_nb[1] * idx_ne1; + + // dst is contiguous + dst_stride = (idx_ne3 * (param.src_ne[1] * param.src_ne[2]) + + idx_ne2 * param.src_ne[1] + + idx_ne1) * (param.src_ne[0] * sizeof(DST_T)); + + src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + + src_stride)); + dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + + dst_stride)); + + pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + + 32 - 1) / 32 * 32); + pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + + 32 - 1) / 32 * 32); + } + + __aicore__ inline void copy_in() { + LocalTensor src_local = src_queue.AllocTensor(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = num_elem * sizeof(SRC_T); + DataCopyPadExtParams padParams; + DataCopyPad(src_local, src_gm, dataCopyParams, padParams); + + src_queue.EnQue(src_local); + } + + __aicore__ inline void copy_out() { + LocalTensor dst_local = dst_queue.DeQue(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = num_elem * sizeof(DST_T); + DataCopyPad(dst_gm, dst_local, dataCopyParams); + + dst_queue.FreeTensor(dst_local); + } + + __aicore__ inline void dup() { + // main process, copy one row data from src to dst. + copy_in(); + + LocalTensor src_local = src_queue.DeQue(); + LocalTensor dst_local = dst_queue.AllocTensor(); + + int32_t BLOCK_NUM = 32 / sizeof(DST_T); + DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) + / BLOCK_NUM * BLOCK_NUM); + dst_queue.EnQue(dst_local); + + src_queue.FreeTensor(src_local); + copy_out(); + } + + __aicore__ inline void dup_with_cast() { + // main process, copy one row data from src to dst. + // cast dtype from src to dst. 
+ copy_in(); + + LocalTensor src_local = src_queue.DeQue(); + LocalTensor dst_local = dst_queue.AllocTensor(); + + Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); + dst_queue.EnQue(dst_local); + + src_queue.FreeTensor(src_local); + copy_out(); + } + + private: + + TPipe pipe; + GlobalTensor src_gm; + GlobalTensor dst_gm; + + int64_t num_rows; + int64_t num_elem; + int64_t idx_ne3; + int64_t idx_ne2; + int64_t idx_ne1; + int64_t src_stride; + int64_t dst_stride; + + TQue src_queue; + TQue dst_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup_with_cast(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR param) { + + // copy params from gm to ub. + dup_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(dup_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + DupByRows op; + op.init(src_gm, dst_gm, param_ub); + op.dup_with_cast(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/dup.h b/ggml/src/ggml-cann/kernels/dup.h new file mode 100644 index 0000000000000..d58cd15ef2563 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/dup.h @@ -0,0 +1,14 @@ +#ifndef DUP_H +#define DUP_H + +#pragma pack(push, 8) +typedef struct { + int64_t src_ne[4]; + int64_t src_nb[4]; + int64_t dst_ne[4]; + int64_t dst_nb[4]; + +} dup_param; +#pragma pack(pop) + +#endif //DUP_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp new file mode 100644 index 0000000000000..352aba07e5d50 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp @@ -0,0 +1,186 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. 
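// Reference semantics of this kernel (a plain restatement of ggml's GET_ROWS
// for orientation; ne/nb follow ggml conventions):
//   for each (i0, i1, i2) over the index tensor:
//       row = indices[i0, i1, i2];
//       output[:, i0, i1, i2] = (float) input[:, row, i1, i2];
// Each AI core receives a contiguous slice of the flattened index space (the
// ir/dr split in init below) and converts one fp16 row to fp32 per iteration.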
+using namespace AscendC; + +#define BUFFER_NUM 2 + +class GET_ROW_F16 { + public: + __aicore__ inline GET_ROW_F16() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *indices_ne_ub, size_t *indices_nb_ub, + int64_t *output_ne_ub, size_t *output_nb_ub) { + // TODO, use template for F16/f32 + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. + uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ half *)input); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31) + & ~31); + uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31) + & ~31); + + local_buffer_elems = input_local_buffer_size / sizeof(half); + + // TODO, consider long row that can't put in UB. + // All data should asign to 32. It's ok because all data is align to 32. + pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size); + pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size); + } + + __aicore__ inline void copy_in(uint32_t offset, size_t len) { + LocalTensor input_local = input_queue.AllocTensor(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(input_local, input_gm[offset], len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(half); + DataCopyPadExtParams padParams; + DataCopyPad(input_local[len], input_gm[offset + len], + dataCopyParams, padParams); + } + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset, size_t len) { + LocalTensor output_local = output_queue.DeQue(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(output_gm[offset], output_local, len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPad(output_gm[offset + len], output_local[len], + dataCopyParams); + } + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_row(int64_t idx) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3]; 
+ + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3]; + + copy_in(input_offset, input_ne[0]); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + + Cast(output_local, input_local, RoundMode::CAST_NONE, + local_buffer_elems); + output_queue.EnQue(output_local); + copy_out(output_offset, input_ne[0]); + + input_queue.FreeTensor(input_local); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + calculate_row(i); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + size_t local_buffer_elems; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_f16( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, + GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_F16 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, + indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp new file mode 100644 index 0000000000000..d44d153b335ef --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp @@ -0,0 +1,180 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +class GET_ROW_F32 { + public: + __aicore__ inline GET_ROW_F32() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *indices_ne_ub, size_t *indices_nb_ub, + int64_t *output_ne_ub, size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
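// Work-split example for the ir/dr computation that follows (illustrative):
// with n_elements = 10 rows and op_block_num = 3 cores, dr = 3 and tails = 1,
// so core 0 takes rows [0, 4) (its dr is bumped to 4), core 1 takes [4, 7)
// and core 2 takes [7, 10); every row is covered exactly once. The same split
// is repeated in all get_row_* kernels.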
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ float *)input); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31); + local_buffer_elems = local_buffer_size / sizeof(float); + + // TODO, consider long row that can't put in UB. + // All data should asign to 32. It's ok because all data is align to 32. + pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size); + pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size); + } + + __aicore__ inline void copy_in(uint32_t offset, size_t len) { + LocalTensor input_local = input_queue.AllocTensor(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(input_local, input_gm[offset], len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPadExtParams padParams; + DataCopyPad(input_local[len], input_gm[offset + len], + dataCopyParams, padParams); + } + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset, size_t len) { + LocalTensor output_local = output_queue.DeQue(); + size_t tail = len % 32; + len = len & ~31; + DataCopy(output_gm[offset], output_local, len); + if(tail != 0) { + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPad(output_gm[offset + len], output_local[len], + dataCopyParams); + } + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_row(int64_t idx) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3]; + + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3]; + + copy_in(input_offset, input_ne[0]); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + + DataCopy(output_local, input_local, local_buffer_elems); + output_queue.EnQue(output_local); + copy_out(output_offset, input_ne[0]); + + input_queue.FreeTensor(input_local); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + calculate_row(i); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + size_t local_buffer_elems; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; +}; + +template +__aicore__ 
inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_f32( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, + GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_F32 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, + indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp new file mode 100644 index 0000000000000..d3f229c1980b5 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp @@ -0,0 +1,193 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +#define QK4_0 32 + +class GET_ROW_Q4_0 { + public: + __aicore__ inline GET_ROW_Q4_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, int64_t *indices_ne_ub, + size_t *indices_nb_ub, int64_t *output_ne_ub, + size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + scale_ne[i] = input_ne_ub[i]; + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // one scale for a group. + scale_ne[0] /= QK4_0; + + input_stride[0] = 1; + scale_stride[0] = 1; + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + group_size_in_row = input_ne[0] / QK4_0; + int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * + input_ne[3] / 2; + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ int4b_t *)input); + scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t)); + pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error? + DataCopy(input_local, input_gm[offset], QK4_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK4_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_group(int64_t idx, int64_t group) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3] + + group * QK4_0; + const int64_t scale_offset = selected_row_idx * scale_stride[1] + + indices_ne1_idx * scale_stride[2] + + indices_ne2_idx * scale_stride[3] + group; + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3] + + group * QK4_0; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor output_local = output_queue.AllocTensor(); + + // TODO: cast more data to speed up. + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); + Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); + + // Only mul need compile by group. 
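// Dequantization done here, per group of QK4_0 = 32 weights (a sketch that
// assumes the buffer holds signed int4 quants as prepared by the backend's
// Q4_0 transform, i.e. the usual ggml offset of -8 is already folded in):
//   w[j] = scale * q[j],   q[j] in [-8, 7],  scale stored as fp16 per group
// The two Casts above widen int4 -> half -> float, and the Muls below applies
// the per-group scale fetched from the scale area appended after the quants.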
+ half scale = scale_gm.GetValue(scale_offset); + + Muls(output_local, output_local, (float)scale, QK4_0); + + input_queue.FreeTensor(input_local); + cast_queue.FreeTensor(cast_local); + output_queue.EnQue(output_local); + + copy_out(output_offset); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + calculate_group(i, j); + } + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t scale_ne[4]; + size_t scale_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t ir; + int64_t dr; + + int64_t group_size_in_row; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue cast_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_Q4_0 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, + indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp new file mode 100644 index 0000000000000..724669725b5cc --- /dev/null +++ b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp @@ -0,0 +1,191 @@ +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +#define QK8_0 32 + +class GET_ROW_Q8_0 { + public: + __aicore__ inline GET_ROW_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, int64_t *indices_ne_ub, + size_t *indices_nb_ub, int64_t *output_ne_ub, + size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + scale_ne[i] = input_ne_ub[i]; + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // one scale for a group. + scale_ne[0] /= QK8_0; + + input_stride[0] = 1; + scale_stride[0] = 1; + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + group_size_in_row = input_ne[0] / QK8_0; + int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * + input_ne[3] * sizeof(int8_t); + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ int8_t *)input); + scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_group(int64_t idx, int64_t group) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3] + + group * QK8_0; + const int64_t scale_offset = selected_row_idx * scale_stride[1] + + indices_ne1_idx * scale_stride[2] + + indices_ne2_idx * scale_stride[3] + group; + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3] + + group * QK8_0; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor output_local = output_queue.AllocTensor(); + + // TODO: cast more data to speed up. + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); + Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0); + + // Only mul need compile by group. 
+ half scale = scale_gm.GetValue(scale_offset); + Muls(output_local, output_local, (float)scale, QK8_0); + + input_queue.FreeTensor(input_local); + cast_queue.FreeTensor(cast_local); + output_queue.EnQue(output_local); + + copy_out(output_offset); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + calculate_group(i, j); + } + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t scale_ne[4]; + size_t scale_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t ir; + int64_t dr; + + int64_t group_size_in_row; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue cast_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_q8_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_Q8_0 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, + indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp new file mode 100644 index 0000000000000..5d33bf07a6303 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp @@ -0,0 +1,208 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +#define BUFFER_NUM 2 +#define QK8_0 32 + +class QUANTIZE_F16_Q8_0 { + public: + __aicore__ inline QUANTIZE_F16_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + } + + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / QK8_0; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. 
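Both the get_row and the quantize kernels assume the same flat device layout for a Q8_0 tensor: all int8 quants for the whole tensor come first, followed by one fp16 scale per QK8_0 group (this is what `scale_offset` in the get_row kernels and `output_size` below point at). A small host-side sketch of that layout, with illustrative names that are not part of the patch:

    #include <cstddef>
    #include <cstdint>

    struct q8_0_layout {
        size_t quants_bytes;   // ne0*ne1*ne2*ne3 * sizeof(int8_t)
        size_t scales_offset;  // scales start right after the quants
        size_t scales_bytes;   // one fp16 (2 bytes) per group of 32 values
    };

    static q8_0_layout q8_0_buffer_layout(const int64_t ne[4]) {
        const int64_t n_elems  = ne[0] * ne[1] * ne[2] * ne[3];
        const int64_t n_groups = n_elems / 32;          // QK8_0 == 32
        q8_0_layout l;
        l.quants_bytes  = (size_t) n_elems  * sizeof(int8_t);
        l.scales_offset = l.quants_bytes;
        l.scales_bytes  = (size_t) n_groups * sizeof(uint16_t);
        return l;
    }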
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t); + + input_gm.SetGlobalBuffer((__gm__ half *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir * + group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(work_queue, 1, 32); + pipe.InitBuffer(max_queue, 1, 32); + pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); + pipe.InitBuffer(scale_queue, 1, 32); + pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + QK8_0 * group; + + const int64_t output_offset = i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + QK8_0 * group; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor abs_local = abs_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); + Abs(abs_local, cast_local, QK8_0); + ReduceMax(max_local, abs_local, work_local, QK8_0); + + pipe_barrier(PIPE_ALL); + float d = max_local.GetValue(0); + d = d / ((1 << 7) - 1); + if (d != 0) { + Muls(cast_local, cast_local, 1.0f / d, QK8_0); + } + + Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0); + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + abs_queue.FreeTensor(abs_local); + max_queue.FreeTensor(max_local); + cast_queue.FreeTensor(cast_local); + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + if (scale_local_offset == 16) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, 16); + 
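The per-group math in calculate_group() above is the standard Q8_0 scheme: the group's scale is the maximum absolute value divided by 127, and each element is divided by that scale and rounded to an int8. A CPU reference of the same computation (a sketch; the kernel expresses it with Abs, ReduceMax, Muls and Cast with CAST_ROUND):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantize one group of QK8_0 floats; returns the scale that the kernel
    // later stores as fp16 behind the quantized data.
    static float quantize_q8_0_group(const float * x, int8_t * q) {
        float amax = 0.0f;
        for (int i = 0; i < 32; ++i) {                  // QK8_0 == 32
            amax = std::max(amax, std::fabs(x[i]));
        }
        const float d = amax / 127.0f;                  // (1 << 7) - 1
        for (int i = 0; i < 32; ++i) {
            q[i] = (int8_t) std::lround(d != 0.0f ? x[i] / d : 0.0f);
        }
        return d;
    }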
pipe_barrier(PIPE_ALL); + scale_global_offset += 16; + } + } + } + + if (scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue abs_queue; + TQue scale_queue; + TQue cast_queue; + +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_F16_Q8_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp new file mode 100644 index 0000000000000..bdb47231f04f3 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp @@ -0,0 +1,206 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +#define BUFFER_NUM 2 +#define QK8_0 32 + +class QUANTIZE_F32_Q8_0 { + public: + __aicore__ inline QUANTIZE_F32_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + } + + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / QK8_0; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. 
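Both quantize kernels stage the group scales in a small local buffer and write them back 16 at a time (16 halves is 32 bytes, one aligned block), flushing any remainder with a padded copy at the end. The same pattern in plain C++, where the `flush` callback stands in for DataCopy / DataCopyPad (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <vector>

    static void flush_scales_in_blocks(
            const std::vector<uint16_t> & scales,
            const std::function<void(size_t offset, size_t count, bool padded)> & flush) {
        const size_t block = 16;                        // 16 * sizeof(half) == 32 bytes
        size_t i = 0;
        for (; i + block <= scales.size(); i += block) {
            flush(i, block, false);                     // full block -> DataCopy
        }
        if (i < scales.size()) {
            flush(i, scales.size() - i, true);          // tail -> DataCopyPad
        }
    }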
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t); + + input_gm.SetGlobalBuffer((__gm__ float *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + + ir * group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(work_queue, 1, 32); + pipe.InitBuffer(max_queue, 1, 32); + pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); + pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half)); + pipe.InitBuffer(scale_queue, 1, 32); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + QK8_0 * group; + + const int64_t output_offset = i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + QK8_0 * group; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor abs_local = abs_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + + Abs(abs_local, input_local, QK8_0); + ReduceMax(max_local, abs_local, work_local, QK8_0); + pipe_barrier(PIPE_ALL); + float d = max_local.GetValue(0); + d = d / ((1 << 7) - 1); + if (d != 0) { + Muls(input_local, input_local, 1.0f / d, QK8_0); + } + + Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0); + Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0); + Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + abs_queue.FreeTensor(abs_local); + max_queue.FreeTensor(max_local); + cast_queue.FreeTensor(cast_local); + + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + if (scale_local_offset == 16) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, 16); + pipe_barrier(PIPE_ALL); + scale_global_offset += 16; + } + } + } + + if 
(scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue abs_queue; + TQue cast_queue; + TQue scale_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_F32_Q8_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/rope.h b/ggml/src/ggml-cann/kernels/rope.h new file mode 100644 index 0000000000000..223da0595b0a7 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/rope.h @@ -0,0 +1,19 @@ +#ifndef ROPE_H +#define ROPE_H + +#pragma pack(push, 8) +typedef struct { + int64_t input_ne[4]; + int64_t position_ne[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int n_dims; + int n_orig_ctx; + float theta_scale; + float corr_dims[2]; + bool is_neox; + bool is_glm; + +} rope_param; +#pragma pack(pop) + +#endif //ROPE_H \ No newline at end of file diff --git a/ggml/src/ggml-cann/kernels/rope_init_cache.cpp b/ggml/src/ggml-cann/kernels/rope_init_cache.cpp new file mode 100644 index 0000000000000..1d55f8969a483 --- /dev/null +++ b/ggml/src/ggml-cann/kernels/rope_init_cache.cpp @@ -0,0 +1,276 @@ +#include "kernel_operator.h" +#include "rope.h" + +#include + +using namespace AscendC; + +#define BUFFER_NUM 1 + +class InitCache { + public: + __aicore__ inline InitCache() {} + __aicore__ inline void init(GM_ADDR position, + GM_ADDR sin_output, + GM_ADDR cos_output, + rope_param& param, + int64_t* input_ne_ub) { + /*Init sin&cos cache for rope, impl of ggml_compute_forward_rope_f32(). + each kernel process input_ne[0]*1 cache. + */ + + // Input has four dims. [batch, seq_len, heads, head_dim]. 
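What one instance of this kernel produces, per position, is a sin table and a cos table of length head_dim: theta_i = pos * theta_scale^i for i in [0, head_dim/2), duplicated either pairwise (the non-neox, non-glm layout) or by halves (the neox layout), and both scaled by attn_factor. A host reference of that result (a sketch of the math, not of the AscendC implementation; `pos` here is the position value after any freq_scale adjustment):

    #include <cmath>
    #include <vector>

    static void rope_cache_ref(float pos, float theta_scale, float attn_factor,
                               int head_dim, bool neox_layout,
                               std::vector<float> & sin_out,
                               std::vector<float> & cos_out) {
        sin_out.assign(head_dim, 0.0f);
        cos_out.assign(head_dim, 0.0f);
        float theta = pos;                              // theta_scale^0 == 1
        for (int i = 0; i < head_dim / 2; ++i) {
            const float s = std::sin(theta) * attn_factor;
            const float c = std::cos(theta) * attn_factor;
            if (neox_layout) {                          // elements i and i + head_dim/2 share a value
                sin_out[i] = sin_out[i + head_dim / 2] = s;
                cos_out[i] = cos_out[i + head_dim / 2] = c;
            } else {                                    // elements 2i and 2i+1 share a value
                sin_out[2 * i] = sin_out[2 * i + 1] = s;
                cos_out[2 * i] = cos_out[2 * i + 1] = c;
            }
            theta *= theta_scale;
        }
    }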
+ int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + // arange param + // head_dim = param.input_ne[0]; + // head_dim = param.input_ne[0]; + head_dim = input_ne_ub[0]; + first_value = 0; + diff_value = 1; + count = head_dim / 2; + + // power param + theta_scale = param.theta_scale; + + // broadcast param + // arange_shape: [count, 1] -> broadcast_shape0: [count, 2] + arange_shape[0] = count; + arange_shape[1] = 1; + broadcast_shape0[0] = count; + broadcast_shape0[1] = 2; + + // arange_shape1: [1, count] -> broadcast_shape2: [2, count] + arange_shape1[0] = 1; + arange_shape1[1] = count; + broadcast_shape2[0] = 2; + broadcast_shape2[1] = count; + + // position_shape: [1, 1] -> broadcast_shape1: [1, head_dim] + position_shape[0] = 1; + position_shape[1] = 1; + broadcast_shape1[0] = 1; + broadcast_shape1[1] = head_dim; + + // position raw and brcst size. + position_size = 1; + broadcast_size = broadcast_shape1[0] * broadcast_shape1[1]; + + // other param + attn_factor = param.attn_factor; + freq_scale = param.freq_scale; + is_neox = param.is_neox; + is_glm = param.is_glm; + + // stride + position_stride = op_block_idx; + output_stride = op_block_idx * broadcast_size; + + position_gm.SetGlobalBuffer((__gm__ float_t*)position + position_stride, + 1); + output_sin_gm.SetGlobalBuffer((__gm__ float_t*)sin_output + + output_stride, + broadcast_size); + output_cos_gm.SetGlobalBuffer((__gm__ float_t*)cos_output + + output_stride, + broadcast_size); + + pipe.InitBuffer(power_queue, BUFFER_NUM, + (sizeof(float_t)*count+32-1)/32*32); + pipe.InitBuffer(position_queue, BUFFER_NUM, + (sizeof(float_t)*position_size+32-1)/32*32); + pipe.InitBuffer(arange_queue, BUFFER_NUM, + (sizeof(float_t)*count+32-1)/32*32); + pipe.InitBuffer(sin_mul_mscale_queue, BUFFER_NUM, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(cos_mul_mscale_queue, BUFFER_NUM, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(broadcast_power_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(theta_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(sin_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + pipe.InitBuffer(cos_buffer, + (sizeof(float_t)*broadcast_size+32-1)/32*32); + } + + __aicore__ inline void copy_in() { + LocalTensor input_local = + position_queue.AllocTensor(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = position_size * sizeof(float_t); + DataCopyPadExtParams padParams; + DataCopyPad(input_local, position_gm, dataCopyParams, padParams); + + position_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out() { + LocalTensor sin_local = sin_mul_mscale_queue.DeQue(); + int32_t BLOCK_NUM = 32 / sizeof(float_t); + DataCopy(output_sin_gm, sin_local, (broadcast_size + BLOCK_NUM - 1) + / BLOCK_NUM * BLOCK_NUM); + + LocalTensor cos_local = cos_mul_mscale_queue.DeQue(); + DataCopy(output_cos_gm, cos_local, (broadcast_size + BLOCK_NUM - 1) + / BLOCK_NUM * BLOCK_NUM); + + sin_mul_mscale_queue.FreeTensor(sin_local); + cos_mul_mscale_queue.FreeTensor(cos_local); + } + + __aicore__ inline void calculate() { + + // arange + LocalTensor arange_local = arange_queue.AllocTensor(); + ArithProgression(arange_local, first_value, diff_value, count); + + // pow + LocalTensor power_local = power_queue.AllocTensor(); + Power(power_local, static_cast(theta_scale), + arange_local); + + LocalTensor power_brcast_local = + broadcast_power_buffer.Get(); + + //TODO: is_glm==true. 
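The repeated `(sizeof(float_t)*N + 32 - 1) / 32 * 32` expressions in the InitBuffer calls above round each buffer size up to a multiple of 32 bytes, the block granularity the on-chip copies work in. The same rounding as a named helper (illustrative only; the kernel keeps the inline form):

    #include <cstddef>

    static constexpr size_t align_up_32(size_t bytes) {
        return (bytes + 32 - 1) / 32 * 32;
    }
    // e.g. align_up_32(sizeof(float) * 100) == 416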
+ if (!is_glm && !is_neox) { + // for :dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + // dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; + // the value of 0,1 or 2,3, ..., should be same. + + // broadcast: e.g. arange [64, 1] -> [64, 2] + BroadCast(power_brcast_local, power_local, + broadcast_shape0, arange_shape); + // position: [1] + copy_in(); + LocalTensor position_local = + position_queue.DeQue(); + position_value = position_local.GetValue(0); + position_queue.FreeTensor(position_local); + } + else { + // for: dst_data[0] = x0*cos_theta - x1*sin_theta; + // dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + // the value of 0,n_dims/2 or 1,n/dims/2+1 should be same. + + // broadcast: e.g. arange [1, 64] -> [2, 64] + BroadCast(power_brcast_local, power_local, + broadcast_shape2, arange_shape1); + + // position * freq_scale + copy_in(); + LocalTensor position_local = + position_queue.DeQue(); + position_value = position_local.GetValue(0); + position_value = position_value * freq_scale; + position_queue.FreeTensor(position_local); + } + + // theta + LocalTensor theta_local = theta_buffer.Get(); + Muls(theta_local, power_brcast_local, position_value, + broadcast_size); + + // sin & cos + // TODO: if ext_factor != 0 + LocalTensor sin_local = sin_buffer.Get(); + Sin(sin_local, theta_local); + LocalTensor sin_mul_mscale_local = + sin_mul_mscale_queue.AllocTensor(); + Muls(sin_mul_mscale_local, sin_local, attn_factor, broadcast_size); + + LocalTensor cos_local = cos_buffer.Get(); + Cos(cos_local, theta_local); + LocalTensor cos_mul_mscale_local = + cos_mul_mscale_queue.AllocTensor(); + Muls(cos_mul_mscale_local, cos_local, attn_factor, broadcast_size); + + // release, VECCALC not need. + arange_queue.FreeTensor(arange_local); + power_queue.FreeTensor(power_local); + + // output + sin_mul_mscale_queue.EnQue(sin_mul_mscale_local); + cos_mul_mscale_queue.EnQue(cos_mul_mscale_local); + copy_out(); + } + + private: + + int64_t head_dim; + float_t first_value; + float_t diff_value; + int32_t count; + float_t theta_scale; + float_t attn_factor; + float_t freq_scale; + bool is_neox; + bool is_glm; + + uint32_t broadcast_shape0[2]; + uint32_t broadcast_shape1[2]; + uint32_t broadcast_shape2[2]; + uint32_t position_shape[2]; + uint32_t arange_shape[2]; + uint32_t arange_shape1[2]; + int64_t broadcast_size; + int64_t position_size; + int64_t position_stride; + int64_t output_stride; + float_t position_value; + + TPipe pipe; + GlobalTensor position_gm; + GlobalTensor output_sin_gm; + GlobalTensor output_cos_gm; + TQue arange_queue; + TQue power_queue; + TQue position_queue; + TQue sin_mul_mscale_queue; + TQue cos_mul_mscale_queue; + TBuf broadcast_power_buffer; + TBuf theta_buffer; + TBuf sin_buffer; + TBuf cos_buffer; + +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, int32_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_rope_init_cache( + GM_ADDR position_gm, + GM_ADDR output_sin_gm, + GM_ADDR output_cos_gm, + GM_ADDR param, + GM_ADDR input_ne_gm + ) { + // copy params from gm to ub. 
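The wrapper below copies the parameter struct and the ne arrays byte by byte out of global memory, so the literal `32` passed to copy_to_ub() is a byte count: four int64_t dims per array. A small host-side check that makes the intent explicit (illustrative; the kernels pass the literal directly):

    #include <cstdint>

    // Four 8-byte dims are copied for every ne/nb array.
    constexpr int32_t DIMS_BYTES = 4 * (int32_t) sizeof(int64_t);
    static_assert(DIMS_BYTES == 32, "kernels copy four 8-byte dims per array");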
+ rope_param param_ub; + auto param_gm_ptr = (__gm__ uint8_t*)param; + auto param_ub_ptr = (uint8_t*)¶m_ub; + + for (int32_t i = 0; i < static_cast(sizeof(rope_param) / sizeof(uint8_t)); + ++i, ++param_gm_ptr, ++param_ub_ptr) { + *param_ub_ptr = *param_gm_ptr; + } + + int64_t input_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + + InitCache op; + op.init(position_gm, output_sin_gm, output_cos_gm, param_ub, input_ne_ub); + op.calculate(); +} \ No newline at end of file diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f5502afbe98b3..78fb3c7d9cd84 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3289,7 +3289,7 @@ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tenso } // check if t1 can be represented as a repeatition of t0 -static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return ggml_is_empty(t0) ? ggml_is_empty(t1) : @@ -13563,6 +13563,7 @@ static void ggml_compute_forward_soft_max( } } + // ggml_compute_forward_soft_max_back static void ggml_compute_forward_soft_max_back_f32( @@ -21855,6 +21856,14 @@ int ggml_cpu_has_rpc(void) { #endif } +int ggml_cpu_has_cann(void) { +#if defined(GGML_USE_CANN) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_gpublas(void) { return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); } diff --git a/src/llama.cpp b/src/llama.cpp index 3dc0f85351c50..2ace2c2c921b1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19,6 +19,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_CANN) +# include "ggml-cann.h" #endif #ifdef GGML_USE_BLAS @@ -2639,6 +2641,8 @@ static size_t llama_get_device_count(const llama_model & model) { count = ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) count = ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_CANN) + return ggml_backend_cann_get_device_count(); #endif #if defined(GGML_USE_RPC) count += model.rpc_servers.size(); @@ -2671,6 +2675,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_CANN) + buft = ggml_backend_cann_buffer_type(gpu); #endif if (buft == nullptr) { @@ -2731,6 +2737,11 @@ static size_t llama_get_device_memory(const llama_model & model, int device) { size_t free; ggml_backend_vk_get_device_memory(device, &free, &total); return free; +#elif defined(GGML_USE_CANN) + size_t total; + size_t free; + ggml_backend_cann_get_device_memory(device, &total, &free); + return free; #else return 1; #endif @@ -17059,6 +17070,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_CANN) + return GGML_CANN_MAX_DEVICES; #else return 1; #endif @@ -17091,6 +17104,10 @@ void llama_backend_init(void) { struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } + +#if defined(GGML_USE_CANN) + ggml_cann_backend_init(); +#endif } void llama_numa_init(enum ggml_numa_strategy numa) { @@ -17100,6 +17117,10 @@ void llama_numa_init(enum ggml_numa_strategy numa) { } void llama_backend_free(void) { +#if defined(GGML_USE_CANN) + ggml_cann_backend_free(); +#endif + 
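Outside of llama.cpp, the backend API added by this patch can be driven directly. A minimal sketch of bringing the backend up and tearing it down, mirroring what llama_backend_init, llama_new_context_with_model and llama_backend_free do above (error handling reduced to a null check):

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cann.h"

    int main() {
        ggml_cann_backend_init();                               // process-wide init, as in llama_backend_init
        const int32_t n_dev = ggml_backend_cann_get_device_count();
        printf("CANN devices: %d\n", n_dev);
        if (n_dev > 0) {
            ggml_backend_t backend = ggml_backend_cann_init(0); // main GPU, device 0
            if (backend != nullptr) {
                printf("backend: %s\n", ggml_backend_name(backend));
                ggml_backend_free(backend);
            }
        }
        ggml_cann_backend_free();                               // as in llama_backend_free
        return 0;
    }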
ggml_quantize_free(); } @@ -17389,6 +17410,30 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_CANN) + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + // TODO: ggml_backend_cann is not support split tensor now, just leave code here. + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version. + for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cann_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } #endif #ifdef GGML_USE_BLAS diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cfa7073153486..f0b50ba1f5fd4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -123,6 +123,7 @@ llama_target_and_test(test-grammar-integration.cpp) llama_target_and_test(test-grad0.cpp) # llama_target_and_test(test-opt.cpp) # SLOW llama_target_and_test(test-backend-ops.cpp) +llama_target_and_test(test-backend-runtime.cpp) llama_target_and_test(test-rope.cpp) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f74c0db475e2e..2906e831815af 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -759,7 +759,7 @@ struct test_dup : public test_case { } test_dup(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}, + std::array ne = {10, 10, 20, 1}, std::array permute = {0, 0, 0, 0}) : type(type), ne(ne), permute(permute), _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {} @@ -779,9 +779,12 @@ struct test_cpy : public test_case { const ggml_type type_src; const ggml_type type_dst; const std::array ne; + const std::array permute; + bool _src_use_permute; + bool _dst_use_permute; std::string vars() override { - return VARS_TO_STR3(type_src, type_dst, ne); + return VARS_TO_STR6(type_src, type_dst, ne, permute, _src_use_permute, _dst_use_permute); } double max_nmse_err() override { @@ -793,12 +796,27 @@ struct test_cpy : public test_case { } test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}) - : type_src(type_src), type_dst(type_dst), ne(ne) {} + std::array ne = {10, 10, 10, 1}, + std::array permute = {0, 0, 0, 0}, + bool _dst_use_permute = false) + : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute), + _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0), + _dst_use_permute(_dst_use_permute) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data()); + ggml_tensor * dst; + if (_src_use_permute) { + src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); + } + if (_src_use_permute && _dst_use_permute) { + dst = ggml_new_tensor(ctx, type_src, 4, ne.data()); + dst = 
ggml_permute(ctx, dst, permute[0], permute[1], permute[2], permute[3]); + } + else { + int64_t* dst_ne = src->ne; + dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne); + } ggml_tensor * out = ggml_cpy(ctx, src, dst); return out; } @@ -1174,6 +1192,7 @@ struct test_soft_max : public test_case { } }; + // GGML_OP_ROPE struct test_rope : public test_case { const ggml_type type; @@ -2109,12 +2128,24 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_dup(GGML_TYPE_F16)); test_cases.emplace_back(new test_dup(GGML_TYPE_I32)); test_cases.emplace_back(new test_dup(GGML_TYPE_I16)); + test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows + test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst no contiguous test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // test cpy by rows + } + } + + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { + for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // dst contiguous + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3}, true)); // dst no contiguous } } @@ -2246,7 +2277,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (int n = 0; n < 10; ++n) { int64_t ne0 = dist_ne0(rng); int64_t ne1 = dist_ne1(rng); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 
4.0f : 0.0f)); } exponent <<= 1; @@ -2265,7 +2296,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } } } - + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f)); diff --git a/tests/test-backend-runtime.cpp b/tests/test-backend-runtime.cpp new file mode 100644 index 0000000000000..c08f81500d56a --- /dev/null +++ b/tests/test-backend-runtime.cpp @@ -0,0 +1,185 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +struct test_case { + virtual const char* case_desc() = 0; + virtual bool eval(ggml_backend_t backend) = 0; + std::vector get_random_float(size_t size) { + std::vector random_data; + random_data.resize(size); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0, 128); + for (size_t i = 0; i < size; i++) { + random_data[i] = dis(gen); + } + return random_data; + } + + void init_context(size_t tensor_count) { + ggml_init_params params = { + /* .mem_size = */ ggml_tensor_overhead() * tensor_count, + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; + ctx = ggml_init(params); + } + + ggml_tensor* new_tensor(ggml_backend_t backend, int dims, int64_t* ne) { + ggml_tensor* tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, dims, ne); + ggml_backend_buffer_t buf = + ggml_backend_alloc_ctx_tensors(ctx, backend); + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", + ggml_backend_name(backend)); + ggml_free(ctx); + return nullptr; + } + return tensor; + } + + ggml_context* ctx; +}; + +struct test_tensor_get_set_cpy_async : public test_case { + virtual const char* case_desc() { return "test_tensor_get_set_cpy_async"; } + virtual bool eval(ggml_backend_t backend) { + // init context + init_context(2); + + // alloc tensor + int64_t ne[] = {10, 10}; + ggml_tensor* tensor1 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + ggml_tensor* tensor2 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + + // get random data + int64_t elements = ggml_nelements(tensor1); + std::vector random_data = get_random_float(elements); + std::vector verify_data; + verify_data.resize(elements); + + // upload and download data + ggml_backend_tensor_set_async(backend, tensor1, + (void*)random_data.data(), 0, + ggml_nbytes(tensor1)); + ggml_backend_tensor_copy_async(backend, backend, tensor1, tensor2); + ggml_backend_tensor_get_async(backend, tensor2, + (void*)verify_data.data(), 0, + ggml_nbytes(tensor2)); + ggml_backend_synchronize(backend); + + return (memcmp(random_data.data(), verify_data.data(), + sizeof(float) * elements) == 0); + } +}; + +struct test_tensor_get_set_cpy : public test_case { + virtual const char* case_desc() { return "test_tensor_get_set_cpy"; } + virtual bool eval(ggml_backend_t backend) { + // init context + init_context(2); + + // alloc tensor + int64_t ne[] = {10, 10}; + ggml_tensor* tensor1 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + ggml_tensor* tensor2 = + new_tensor(backend, sizeof(ne) / sizeof(ne[0]), ne); + + // get random data + int64_t elements = ggml_nelements(tensor1); + std::vector random_data = get_random_float(elements); + std::vector verify_data; + verify_data.resize(elements); + + // upload and download data + 
ggml_backend_tensor_set(tensor1, (void*)random_data.data(), 0, + ggml_nbytes(tensor1)); + ggml_backend_tensor_copy(tensor1, tensor2); + ggml_backend_tensor_get(tensor2, (void*)verify_data.data(), 0, + ggml_nbytes(tensor2)); + + return (memcmp(random_data.data(), verify_data.data(), + sizeof(float) * elements) == 0); + } +}; + +static bool test_backend(ggml_backend_t backend) { + std::vector> test_cases; + test_cases.emplace_back(new test_tensor_get_set_cpy_async()); + test_cases.emplace_back(new test_tensor_get_set_cpy()); + + size_t n_ok = 0; + for (auto& test : test_cases) { + printf(" %s ", test->case_desc()); + if (test->eval(backend)) { + n_ok++; + printf("\033[1;32mOK\033[0m\n"); + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + } + printf(" %zu/%zu tests passed\n", n_ok, test_cases.size()); + + return n_ok == test_cases.size(); +} + +static void usage(char** argv) { printf("Usage: %s [-b backend]\n", argv[0]); } + +int main(int argc, char** argv) { + const char* backend = NULL; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-b") == 0) { + if (i + 1 < argc) { + backend = argv[++i]; + } else { + usage(argv); + return 1; + } + } else { + usage(argv); + return 1; + } + } + + printf("Testing %zu backends\n\n", ggml_backend_reg_get_count()); + + size_t n_ok = 0; + for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) { + if (backend != NULL && + strcmp(backend, ggml_backend_reg_get_name(i)) != 0) { + printf(" Skipping %s\n\n", ggml_backend_reg_get_name(i)); + n_ok++; + continue; + } + + ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL); + + GGML_ASSERT(backend != NULL); + printf(" Backend name: %s\n", ggml_backend_name(backend)); + + bool ok = test_backend(backend); + + printf(" Backend %s: ", ggml_backend_name(backend)); + if (ok) { + printf("\033[1;32mOK\033[0m\n"); + n_ok++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + + printf("\n"); + } + + return 0; +} \ No newline at end of file
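The permuted DUP/CPY cases added to test-backend-ops above are what exercise the new backend on non-contiguous tensors. Stripped of the test harness, each one builds a graph along these lines (a sketch using the public ggml API; running it needs a backend and a graph allocator, which test-backend-ops provides):

    #include "ggml.h"

    // One of the new permuted-copy graphs: a non-contiguous F32 view copied
    // into a contiguous F16 tensor of the same (permuted) shape.
    static ggml_tensor * build_permuted_cpy(ggml_context * ctx) {
        int64_t ne[4] = {256, 2, 3, 4};
        ggml_tensor * src = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
        src = ggml_permute(ctx, src, 0, 2, 1, 3);   // swap dims 1 and 2 -> non-contiguous view
        ggml_tensor * dst = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, src->ne);
        return ggml_cpy(ctx, src, dst);
    }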