From 63947bca573b7686382c99cc5f156f16404df182 Mon Sep 17 00:00:00 2001 From: huafengchun Date: Wed, 10 Jul 2024 03:37:05 +0000 Subject: [PATCH] add cann legacy mempool --- ggml/src/ggml-cann.cpp | 145 +++++--- ggml/src/ggml-cann/acl_ops.cpp | 80 ----- ggml/src/ggml-cann/acl_tensor.cpp | 1 + ggml/src/ggml-cann/aclnn_ops.cpp | 541 +++++++++++++++++------------- ggml/src/ggml-cann/aclnn_ops.h | 9 +- ggml/src/ggml-cann/common.h | 119 +++---- src/llama.cpp | 2 + 7 files changed, 482 insertions(+), 415 deletions(-) delete mode 100644 ggml/src/ggml-cann/acl_ops.cpp diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 65ec392aab4f1d..97683f401bcdb8 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -8,7 +8,6 @@ #include #include "ggml-backend-impl.h" -#include "ggml-cann/acl_ops.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" @@ -64,30 +63,122 @@ const ggml_cann_device_info& ggml_cann_info() { return info; } +#define DEBUG_CANN_MALLOC + +// buffer pool for cann (legacy) +struct ggml_cann_pool_leg : public ggml_cann_pool { + static const int MAX_BUFFERS = 256; + + int device; + struct ggml_cann_buffer { + void * ptr = nullptr; + size_t size = 0; + }; + + ggml_cann_buffer buffer_pool[MAX_BUFFERS] = {}; + size_t pool_size = 0; + + explicit ggml_cann_pool_leg(int device) : + device(device) { + } + + ~ggml_cann_pool_leg() { + ggml_cann_set_device(device); + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer & b = buffer_pool[i]; + if (b.ptr != nullptr) { + ACL_CHECK(aclrtFree(b.ptr)); + pool_size -= b.size; + } + } + GGML_ASSERT(pool_size == 0); + } + + void * alloc(size_t size, size_t * actual_size) override { +#ifdef DEBUG_CANN_MALLOC + int nnz = 0; + size_t max_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_CANN_MALLOC + ++nnz; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + ggml_cann_buffer& b = buffer_pool[ibest]; + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + void * ptr; + size_t look_ahead_size = (size_t) (1.05 * size); + look_ahead_size = 256 * ((look_ahead_size + 255)/256); + ggml_cann_set_device(device); + ACL_CHECK(aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST)); + *actual_size = look_ahead_size; + pool_size += look_ahead_size; +#ifdef DEBUG_CANN_MALLOC + printf("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, + (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024)); +#endif + return ptr; + } + + void free(void * ptr, size_t size) override { + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + // memory should always buffered. these memory may still needed by + // tasks in stream. + // TODO, fix me. 
+ GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); + } +}; + +std::unique_ptr ggml_backend_cann_context::new_pool_for_device(int device) { + return std::unique_ptr(new ggml_cann_pool_leg(device)); +} + // cann buffer struct ggml_backend_cann_buffer_context { int32_t device; void* dev_ptr = nullptr; std::string name; - std::vector dev_extra_ptrs; ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) : device(device), dev_ptr(dev_ptr), name(GGML_CANN_NAME + std::to_string(device)) {} - void* get_extra_ptr(size_t size) { - void* buffer; - ACL_CHECK(aclrtMalloc(&buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); - dev_extra_ptrs.push_back(buffer); - return buffer; - } - ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); - for (auto dev_extra_ptr : dev_extra_ptrs) { - ACL_CHECK(aclrtFree(dev_extra_ptr)); - } } }; @@ -270,32 +361,10 @@ GGML_CALL static bool need_transform(ggml_type type) { } } -static void set_tensor_extra(ggml_backend_buffer_t buffer, - ggml_tensor* tensor) { - // if tensor is need transform, make sure all meta data are copied to - // npu. - // TODO: All tensors should copy meta data to npu, but extra is used to - // record memory usage. Only used for perf test. - size_t tensor_meta_size = sizeof(ggml_tensor); - ggml_backend_cann_buffer_context* ctx = - (ggml_backend_cann_buffer_context*)buffer->context; - tensor->extra = ctx->get_extra_ptr(tensor_meta_size); - ACL_CHECK(aclrtMemcpy(tensor->extra, tensor_meta_size, tensor, - tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE)); -} - -static void update_tensor_extra(ggml_tensor* tensor) { - // when tensor->ne/nb changed, make sure ne/nb in extra data also changed. - size_t tensor_meta_size = sizeof(ggml_tensor); - ACL_CHECK(aclrtMemcpy(tensor->extra, tensor_meta_size, tensor, - tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE)); -} - GGML_CALL static void ggml_backend_cann_buffer_init_tensor( ggml_backend_buffer_t buffer, ggml_tensor* tensor) { if (tensor->view_src != NULL && tensor->view_offs == 0) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - set_tensor_extra(buffer, tensor); return; } @@ -313,7 +382,6 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor( memset_size, 0, memset_size)); } } - set_tensor_extra(buffer, tensor); } // TODO: need handle tensor which pas paddings. @@ -650,7 +718,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - update_tensor_extra(dst); break; case GGML_OP_DIAG_MASK_INF: ggml_cann_diag_mask(ctx, dst, -INFINITY); @@ -692,7 +759,6 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ACL_CHECK(aclrtSynchronizeDevice()); - cann_ctx->free_device_buffers(); ACL_CHECK(aclrtResetDevice(cann_ctx->device)); delete cann_ctx; delete backend; @@ -837,9 +903,6 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { ggml_cann_set_device(cann_ctx->device); ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); - - // Free temp buffers binding to stream. - cann_ctx->free_stream_buffers(0); } GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( @@ -856,8 +919,6 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( continue; } - // if tensor is reused, free temp buffers first. 
- cann_ctx->free_tensor_buffers(node); bool ok = ggml_cann_compute_forward(*cann_ctx, node); if (!ok) { diff --git a/ggml/src/ggml-cann/acl_ops.cpp b/ggml/src/ggml-cann/acl_ops.cpp deleted file mode 100644 index fac9ea1ae4c3d2..00000000000000 --- a/ggml/src/ggml-cann/acl_ops.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "acl_ops.h" - -OpCaller::OpCaller() { attrs = aclopCreateAttr(); } - -OpCaller::~OpCaller() { - for (aclTensorDesc* desc : input_descs) { - aclDestroyTensorDesc(desc); - } - for (aclDataBuffer* buffer : input_buffers) { - aclDestroyDataBuffer(buffer); - } - for (aclTensorDesc* desc : output_descs) { - aclDestroyTensorDesc(desc); - } - for (aclDataBuffer* buffer : output_buffers) { - aclDestroyDataBuffer(buffer); - } - aclopDestroyAttr(attrs); -} - -OpCaller& OpCaller::name(std::string _op_name) { - op_name = _op_name; - return *this; -} - -OpCaller& OpCaller::input_no_contiguous(ggml_tensor* tensor, const char* name) { - aclDataType dtype = type_mapping(tensor->type); - // TODO - int64_t ne[] = {tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]}; - aclTensorDesc* tensor_desc = - aclCreateTensorDesc(dtype, GGML_MAX_DIMS, ne, ACL_FORMAT_ND); - aclSetTensorDescName(tensor_desc, name); - input_descs.push_back(tensor_desc); - aclDataBuffer* data_buffer = - aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor)); - input_buffers.push_back(data_buffer); - return *this; -} - -OpCaller& OpCaller::input(ggml_tensor* tensor, const char* name) { - GGML_ASSERT(ggml_is_contiguous(tensor)); - return input_no_contiguous(tensor, name); -} - -OpCaller& OpCaller::output(ggml_tensor* tensor, const char* name) { - aclDataType dtype = type_mapping(tensor->type); - aclTensorDesc* tensor_desc = - aclCreateTensorDesc(dtype, GGML_MAX_DIMS, tensor->ne, ACL_FORMAT_ND); - aclSetTensorDescName(tensor_desc, name); - output_descs.push_back(tensor_desc); - aclDataBuffer* data_buffer = - aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor)); - output_buffers.push_back(data_buffer); - return *this; -} - -OpCaller& OpCaller::attr(int64_t value, const char* name) { - ACL_CHECK(aclopSetAttrInt(attrs, name, value)); - return *this; -} - -OpCaller& OpCaller::attr(bool value, const char* name) { - ACL_CHECK(aclopSetAttrBool(attrs, name, value)); - return *this; -} - -OpCaller& OpCaller::attr(float value, const char* name) { - ACL_CHECK(aclopSetAttrFloat(attrs, name, value)); - return *this; -} - -OpCaller& OpCaller::run(aclrtStream stream) { - ACL_CHECK(aclSetCompileopt(ACL_OP_JIT_COMPILE, "disable")); - ACL_CHECK(aclopCompileAndExecute( - op_name.c_str(), input_descs.size(), input_descs.data(), - input_buffers.data(), output_buffers.size(), output_descs.data(), - output_buffers.data(), attrs, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr, - stream)); - return *this; -} diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp index 04e0077c92be16..5285843c18e7c3 100644 --- a/ggml/src/ggml-cann/acl_tensor.cpp +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -24,6 +24,7 @@ aclDataType type_mapping(ggml_type type) { return ACL_DT_UNDEFINED; } + /** * Transform ggml_tensor to acl_tensor. Note that ggml_tensor dimension order * is reversed compared to acl_tensor. 
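The allocation policy of ggml_cann_pool_leg above (closely modeled on the legacy CUDA pool in ggml-cuda) is: scan a fixed array of up to 256 cached buffers for the best fit, return an exact match immediately, and on a miss over-allocate the request by 5%, rounded up to a multiple of 256 bytes, so that slightly larger follow-up requests can reuse the cached buffer. The standalone sketch below is not part of the patch; it is a minimal host-memory model of that policy only, with std::malloc/std::free standing in for aclrtMalloc/aclrtFree and with device selection, the exact-match early exit, and the DEBUG_CANN_MALLOC statistics left out.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct toy_cann_pool {
    static const int MAX_BUFFERS = 256;
    struct buf { void * ptr = nullptr; std::size_t size = 0; };
    buf buffer_pool[MAX_BUFFERS] = {};

    void * alloc(std::size_t size, std::size_t * actual_size) {
        // best fit: smallest cached buffer that is still large enough
        std::size_t best_diff = SIZE_MAX;
        int ibest = -1;
        for (int i = 0; i < MAX_BUFFERS; ++i) {
            const buf & b = buffer_pool[i];
            if (b.ptr != nullptr && b.size >= size && b.size - size < best_diff) {
                best_diff = b.size - size;
                ibest = i;
            }
        }
        if (ibest >= 0) {
            void * ptr = buffer_pool[ibest].ptr;
            *actual_size = buffer_pool[ibest].size;
            buffer_pool[ibest] = buf{};
            return ptr;
        }
        // miss: over-allocate by 5%, rounded up to a 256-byte multiple,
        // mirroring the look_ahead_size computation in the patch
        std::size_t look_ahead_size = (std::size_t)(1.05 * size);
        look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
        *actual_size = look_ahead_size;
        return std::malloc(look_ahead_size);
    }

    void free(void * ptr, std::size_t size) {
        for (int i = 0; i < MAX_BUFFERS; ++i) {
            if (buffer_pool[i].ptr == nullptr) {
                buffer_pool[i] = buf{ptr, size};
                return;
            }
        }
        std::abort(); // pool full, mirrors the GGML_ASSERT in the patch
    }
};

int main() {
    toy_cann_pool pool;
    std::size_t actual = 0;
    void * a = pool.alloc(1000, &actual);
    std::printf("miss: %zu bytes for a 1000-byte request\n", actual); // 1280
    pool.free(a, actual);
    void * b = pool.alloc(900, &actual);
    std::printf("hit:  reused %zu-byte buffer\n", actual);            // 1280
    std::free(b);
    return 0;
}

Built as C++14, the example reports a 1280-byte allocation for the first 1000-byte request and then reuses that same 1280-byte buffer for the following 900-byte request, which is the reuse pattern the look-ahead padding is designed to enable.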
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index fb1d1980a23898..175949c2109c72 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -32,8 +32,7 @@ #include "kernels/ascendc_kernels.h" static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* repeat_array, - ggml_tensor* bind_tensor) { + aclTensor* acl_dst, int64_t* repeat_array) { // repeat tensor along each dim with repeat_array aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS); @@ -46,7 +45,12 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + // Memory from allocator will "free" immediately, but this memory + // will be distribute to other pointers, but it won't access before + // this async task end. + // All tasks in same stream will execute in queue. + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnRepeat(workspaceAddr, workspaceSize, executor, stream)); @@ -68,14 +72,13 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]}; - aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray, dst); + aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); } static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, - aclTensor* acl_src1, aclTensor* acl_dst, - ggml_tensor* bind_tensor) { + aclTensor* acl_src1, aclTensor* acl_dst) { // add: dst = acl_src0 + alpha*acl_src1 aclScalar* alpha = nullptr; @@ -89,7 +92,8 @@ static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -119,7 +123,7 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) { acl_dst = create_acl_tensor(dst); } - aclnn_add(ctx, acl_src0, acl_src1, acl_dst, dst); + aclnn_add(ctx, acl_src0, acl_src1, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src0)); ACL_CHECK(aclDestroyTensor(acl_src1)); @@ -147,7 +151,8 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnLeakyReluGetWorkspaceSize( acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -160,8 +165,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } static void aclnn_concat(ggml_backend_cann_context& ctx, aclTensorList* tensorList, - aclTensor* acl_dst, int64_t concat_dim, - ggml_tensor* bind_tensor) { + aclTensor* acl_dst, int64_t concat_dim) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -170,7 +174,8 @@ static void aclnn_concat(ggml_backend_cann_context& 
ctx, aclTensorList* tensorLi ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -187,15 +192,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t concat_dim = 1; aclTensor* tensors[] = {acl_src0, acl_src1}; aclTensorList* tensorList = aclCreateTensorList(tensors, 2); - aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst); + aclnn_concat(ctx, tensorList, acl_dst, concat_dim); ACL_CHECK(aclDestroyTensorList(tensorList)); ACL_CHECK(aclDestroyTensor(acl_dst)); } static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, - float start, float stop, float step, int64_t n_elements, - ggml_tensor* bind_tensor) { + float start, float stop, float step, int64_t n_elements) { // arange: [start, stop), out(i+1) = out(i) + step. int64_t steps = (int64_t)std::ceil((stop - start) / step); @@ -212,7 +216,8 @@ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -236,7 +241,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) { memcpy(&stop, (float*)dst->op_params + 1, sizeof(float)); memcpy(&step, (float*)dst->op_params + 2, sizeof(float)); - aclnn_arange(ctx, acl_dst, start, stop, step, n_elements, dst); + aclnn_arange(ctx, acl_dst, start, stop, step, n_elements); ACL_CHECK(aclDestroyTensor(acl_dst)); } @@ -270,7 +275,8 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -302,7 +308,8 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -319,7 +326,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); - void* buffer = ctx.alloc_buffer(dst, ggml_nelements(dst) * sizeof(int64_t)); + ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t)); + void* buffer = temp_buffer_allocator.get(); aclTensor* tmp_tensor = create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS); @@ -332,7 +340,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { acl_src, -1, (order == GGML_SORT_ORDER_DESC ? 
true : false), tmp_tensor, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -343,7 +352,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type), acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream)); @@ -375,7 +385,8 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream stream = ctx.stream(); @@ -408,7 +419,8 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t ne[] = {n_groups, N}; size_t nb[] = {type_size, type_size * n_groups}; size_t n_bytes = N * n_groups; - void* buffer = ctx.alloc_buffer(dst, n_bytes * 2); + ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2); + void* buffer = temp_buffer_allocator.get(); aclTensor* acl_mean_out = create_acl_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); aclTensor* acl_rstd_out = create_acl_tensor( @@ -419,7 +431,8 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { acl_mean_out, acl_rstd_out, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream stream = ctx.stream(); @@ -470,7 +483,8 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, stream)); ACL_CHECK(aclDestroyTensor(acl_src0)); @@ -478,7 +492,8 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, stream)); @@ -509,7 +524,8 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { type_mapping(src->type), acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream stream = ctx.stream(); @@ -540,7 +556,8 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize( acl_src, 
output_size_array, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( @@ -551,9 +568,9 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyTensor(acl_dst)); } -static void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst, - aclTensor* acl_src, aclTensor* acl_dst, int64_t* paddings, - float value = 0.0f) { +static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* paddings, + float value = 0.0f) { aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2); aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); @@ -565,7 +582,8 @@ static void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst, acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream stream = ctx.stream(); @@ -588,7 +606,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t paddings[] = { 0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1], 0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]}; - aclnn_pad(ctx, dst, acl_src, acl_dst, paddings); + aclnn_pad(ctx, acl_src, acl_dst, paddings); ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyTensor(acl_src)); @@ -654,7 +672,8 @@ void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, stream)); @@ -693,8 +712,8 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1]; } - void* buffer = - ctx.alloc_buffer(dst, ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); + ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); + void* buffer = temp_buffer_allocator.get(); aclTensor* tmp_tensor = create_acl_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb, GGML_MAX_DIMS, ACL_FORMAT_NCHW); @@ -702,7 +721,7 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // pad: see padding in ggml_cann_pad() int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0}; float value = -FLT_MAX; - aclnn_pad(ctx, dst, acl_src, tmp_tensor, paddings, value); + aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value); // max_pool std::vector kernel_dims = {k1, k0}; @@ -727,7 +746,8 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations, ceil_mode, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnMaxPool(workspaceAddr, workspaceSize, executor, stream)); @@ -742,8 +762,7 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { 
ACL_CHECK(aclDestroyIntArray(dilations)); } -static void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, - aclTensor* acl_src, aclTensor* acl_dst) { +static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -752,7 +771,8 @@ static void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream stream = ctx.stream(); @@ -765,6 +785,15 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); + ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + src->extra = src_extra_allocator.get(); + dst->extra = dst_extra_allocator.get(); + ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + // TODO: simplefify if (src->type==GGML_TYPE_F16) { if (dst->type==GGML_TYPE_Q8_0) { @@ -776,7 +805,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } if (dst->type==GGML_TYPE_F16) { if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, dst, acl_src, acl_dst); + cann_copy(ctx, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); return; @@ -802,7 +831,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } if (dst->type==GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, dst, acl_src, acl_dst); + cann_copy(ctx, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); return; @@ -840,7 +869,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } if (dst->type==GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, dst, acl_src, acl_dst); + cann_copy(ctx, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); return; @@ -868,7 +897,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } if (dst->type==GGML_TYPE_F16) { if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, dst, acl_src, acl_dst); + cann_copy(ctx, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); return; @@ -895,7 +924,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } else { if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, dst, acl_src, acl_dst); + cann_copy(ctx, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); return; @@ -919,31 +948,24 @@ aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize, } #endif -static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst, +static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, size_t n_bytes, int64_t* ne, int64_t dims, aclDataType type, size_t type_size) { - int64_t elements = 1; - for (int i = 0; i < dims; i++) { - elements *= ne[i]; - } - size_t n_bytes = elements * type_size; - size_t nb[GGML_MAX_DIMS]; nb[0] = type_size; for 
(int i = 1; i < dims; i++) { nb[i] = nb[i - 1] * ne[i - 1]; } - void* buffer = ctx.alloc_buffer(dst, n_bytes); ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream())); aclTensor* zero = create_acl_tensor(buffer, type, type_size, ne, nb, dims); return zero; } -static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst, +static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer, size_t n_bytes, int64_t* ne, int64_t dims, aclDataType type, size_t type_size, float value = 1.0f) { - aclTensor* acl_tensor = aclnn_zero(ctx, dst, ne, dims, type, type_size); + aclTensor* acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); float alpha_host = 1.0f; aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); @@ -956,7 +978,8 @@ static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream())); @@ -979,19 +1002,24 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclOpExecutor* executor; void* workspaceAddr = nullptr; + size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); + ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); + aclTensor* acl_gamma = aclnn_ones( - ctx, dst, src->ne, 1, type_mapping(src->type), ggml_element_size(src)); + ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, type_mapping(src->type), ggml_element_size(src)); - int64_t rstd_ne[] = {1, src->ne[1], src->ne[2], src->ne[3]}; + size_t zero_tensor_n_bytes = src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); + ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); aclTensor* acl_rstd = - aclnn_zero(ctx, dst, rstd_ne, GGML_MAX_DIMS, type_mapping(src->type), + aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, src->ne, GGML_MAX_DIMS, type_mapping(src->type), ggml_element_size(src)); ACL_CHECK(aclnnRmsNormGetWorkspaceSize( acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( @@ -1013,8 +1041,11 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, const int n_past = ((int32_t*)dst->op_params)[0]; + size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *src->ne[3] * ggml_element_size(src); + ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); + aclTensor* mask_tensor = - aclnn_ones(ctx, dst, src->ne, GGML_MAX_DIMS, type_mapping(src->type), + aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, GGML_MAX_DIMS, type_mapping(src->type), ggml_element_size(src), value); uint64_t workspaceSize = 0; @@ -1024,7 +1055,8 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + 
workspaceAddr = workspace_allocator.get(); } ACL_CHECK( @@ -1033,7 +1065,8 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream())); @@ -1045,7 +1078,8 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); @@ -1057,8 +1091,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, } static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, aclDataType cast_data_type, - ggml_tensor* bind_tensor) { + aclTensor* acl_dst, aclDataType cast_data_type) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1067,15 +1100,15 @@ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream)); } static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* new_dim, uint64_t dims, - ggml_tensor* bind_tensor) { + aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); uint64_t workspaceSize = 0; @@ -1085,7 +1118,8 @@ static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( @@ -1152,8 +1186,8 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // Calculate im2col. // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * // dst.elemcount. 
- void* tmp_im2col_buffer = - ctx.alloc_buffer(dst, ggml_nelements(dst) * ggml_element_size(src1)); + ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); + void* tmp_im2col_buffer = im2col_allocator.get(); aclTensor* tmp_im2col_tensor = create_acl_tensor( tmp_im2col_buffer, type_mapping(src1->type), ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND); @@ -1177,15 +1211,18 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream)); // Cast if dst is f16. aclTensor* tmp_cast_tensor = nullptr; + ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool()); if (src1->type != dst->type) { - void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst)); + tmp_cast_allocator.alloc(ggml_nbytes(dst)); + void* tmp_cast_buffer = tmp_cast_allocator.get(); size_t temp_cast_nb[GGML_MAX_DIMS - 1]; temp_cast_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { @@ -1196,7 +1233,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { tmp_cast_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND); aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, - type_mapping(dst->type), dst); + type_mapping(dst->type)); } // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] @@ -1207,9 +1244,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t permute_dim[] = {0, 2, 1}; if (src1->type != dst->type) { - aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3, dst); + aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); } else { - aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3, dst); + aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); } // release @@ -1223,8 +1260,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyIntArray(strides)); } -static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src, - ggml_tensor* bind_tensor) { +static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1232,7 +1268,8 @@ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK( aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( @@ -1240,8 +1277,7 @@ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src, } static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, - float scale, aclTensor* acl_dst, bool inplace, - ggml_tensor* bind_tensor) { + float scale, aclTensor* acl_dst, bool inplace) { aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); uint64_t workspaceSize = 0; @@ -1252,7 +1288,8 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale, &workspaceSize, &executor)); if 
(workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor, @@ -1262,7 +1299,8 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, @@ -1274,7 +1312,7 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, } static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other, ggml_tensor* bind_tensor) { + aclTensor* acl_other) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1282,7 +1320,8 @@ static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK( @@ -1290,8 +1329,7 @@ static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src } static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other, aclTensor* acl_dst, - ggml_tensor* bind_tensor) { + aclTensor* acl_other, aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1299,14 +1337,15 @@ static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_ ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream())); } static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, ggml_tensor* bind_tensor) { + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1314,14 +1353,15 @@ static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK( aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream())); } static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, ggml_tensor* bind_tensor) { + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1329,7 +1369,8 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK( aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, 
workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream())); @@ -1356,19 +1397,19 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, int64_t tmp_arange_ne[] = {half}; size_t tmp_arange_nb[] = {sizeof(dst->type)}; - void* tmp_arange_buffer = ctx.alloc_buffer(dst, half * sizeof(dst->type)); + ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type)); + void* tmp_arange_buffer = arange_allocator.get(); aclTensor* tmp_arange_tensor = create_acl_tensor( tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange, - dst); + aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange); // freq float freq_param = -logf(max_period) / half; bool inplace = true; - aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace, dst); - aclnn_exp(ctx, tmp_arange_tensor, dst); + aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace); + aclnn_exp(ctx, tmp_arange_tensor); // permute: src [0,1,2,3]->[0,1,3,2] int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]}; @@ -1378,13 +1419,14 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; } - void* tmp_permute_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src)); + ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); + void* tmp_permute_buffer = permute_allocator.get(); aclTensor* tmp_permute_tenosr = create_acl_tensor( tmp_permute_buffer, type_mapping(src->type), ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); int64_t permute_dim[] = {0, 1, 3, 2}; int64_t num_dims = 4; - aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims, dst); + aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims); // timestep * freq int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2], @@ -1398,38 +1440,38 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; - void* tmp_mul_buffer = - ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void* tmp_mul_buffer = mul_allocator.get(); aclTensor* tmp_mul_tensor = create_acl_tensor( tmp_mul_buffer, type_mapping(src->type), ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_noinplcace_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, - tmp_mul_tensor, dst); + tmp_mul_tensor); // cos - void* tmp_cos_buffer = - ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void* tmp_cos_buffer = cos_allocator.get(); aclTensor* tmp_cos_tensor = create_acl_tensor( tmp_cos_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor, dst); + aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor); // sin - void* tmp_sin_buffer = - ctx.alloc_buffer(dst, mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * 
ggml_type_size(src->type)); + void* tmp_sin_buffer = sin_allocator.get(); aclTensor* tmp_sin_tensor = create_acl_tensor( tmp_sin_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor, dst); + aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor); // concat int64_t concat_dim = 3; aclTensor* acl_dst = create_acl_tensor(dst); aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor}; aclTensorList* tensorList = aclCreateTensorList(tensors, 2); - aclnn_concat(ctx, tensorList, acl_dst, concat_dim, dst); + aclnn_concat(ctx, tensorList, acl_dst, concat_dim); // release // segmentation fault when delete both tensorList and his elements. @@ -1442,7 +1484,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, } static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, - aclTensor* acl_dst, ggml_tensor* bind_tensor) { + aclTensor* acl_dst) { // fill acl_dst with scalar value. auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); @@ -1454,7 +1496,8 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize( acl_dst, acl_scalar, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor, @@ -1463,8 +1506,7 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, } static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, - aclTensor* acl_dst, aclTensor* acl_exp, - ggml_tensor* bind_tensor) { + aclTensor* acl_dst, aclTensor* acl_exp) { // acl_dst = acl_dst^acl_exp uint64_t workspaceSize = 0; @@ -1474,7 +1516,8 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize( acl_dst, acl_exp, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize, @@ -1496,10 +1539,8 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); // init arange - void* tmp_arange_buffer = - ctx.alloc_buffer(dst, ne2_ne3 * ggml_type_size(dst->type)); - size_t memset_size = ne2_ne3 * ggml_type_size(dst->type); - ACL_CHECK(aclrtMemset(tmp_arange_buffer, memset_size, 0, memset_size)); + ggml_cann_pool_alloc arange_allocator(ctx.pool(), ne2_ne3 * ggml_type_size(dst->type)); + void* tmp_arange_buffer = arange_allocator.get(); // arange1: [1, ..., n_heads_log2_floor+1) float start = 1; @@ -1513,8 +1554,7 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange, - dst); + aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange); aclTensor* tmp_arange2_tensor = nullptr; if (n_heads_log2_floor < ne2_ne3) { @@ -1531,19 +1571,19 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, 
type_mapping(dst->type), ggml_type_size(dst->type), tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, - n_elements_arange, dst); + n_elements_arange); } // init mk_base - void* tmp_mk_base_buffer = - ctx.alloc_buffer(dst, ne2_ne3 * ggml_type_size(dst->type)); + ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), ne2_ne3 * ggml_type_size(dst->type)); + void* tmp_mk_base_buffer = mk_base_allocator.get(); int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; size_t tmp_mk_base1_nb[] = {sizeof(dst->type)}; aclTensor* tmp_mk_base1_tensor = create_acl_tensor( tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor, dst); + aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor); aclTensor* tmp_mk_base2_tensor = nullptr; if (n_heads_log2_floor < ne2_ne3) { @@ -1553,7 +1593,7 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, (char*)tmp_mk_base_buffer + n_heads_log2_floor * ggml_type_size(dst->type), type_mapping(dst->type), ggml_type_size(dst->type), tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor, dst); + aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor); } // init mk @@ -1565,7 +1605,7 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* tmp_arange_tensor = create_acl_tensor( tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor, dst); + aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor); // reshape mk int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; @@ -1585,15 +1625,15 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, for (int i = 1; i < GGML_MAX_DIMS; i++) { tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1]; } - void* tmp_output_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst)); + ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst)); + void* tmp_output_buffer = output_allocator.get(); aclTensor* tmp_output_tensor = create_acl_tensor( tmp_output_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_noinplcace_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor, - dst); + aclnn_noinplcace_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor); // add - aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst, dst); + aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor)); ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor)); @@ -1627,13 +1667,13 @@ void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t tmp_position_ne[] = {ne0, 1, 1, 1}; size_t tmp_position_nb[] = {sizeof(dst->type)}; - void* tmp_position_buffer = ctx.alloc_buffer(dst, ne0 * sizeof(dst->type)); + ggml_cann_pool_alloc position_allocator(ctx.pool(), ne0 * sizeof(dst->type)); + void* tmp_position_buffer = position_allocator.get(); aclTensor* tmp_position_tensor = create_acl_tensor( tmp_position_buffer, type_mapping(dst->type), ggml_type_size(dst->type), tmp_position_ne, tmp_position_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_arange(ctx, tmp_position_tensor, start, stop, step, n_elements_arange, - dst); + aclnn_arange(ctx, tmp_position_tensor, 
start, stop, step, n_elements_arange); // call alibi aclTensor* acl_src = create_acl_tensor(src); @@ -1650,7 +1690,7 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, ggml_tensor* bind_tensor) { + aclTensor* acl_dst) { aclScalar* alpha = nullptr; float alphaValue = 1.0f; alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); @@ -1662,7 +1702,8 @@ static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -1673,8 +1714,7 @@ static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src } static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, - int64_t dim, aclTensor* acl_dst, - ggml_tensor* bind_tensor) { + int64_t dim, aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1684,7 +1724,8 @@ static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream stream = ctx.stream(); @@ -1708,7 +1749,8 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); size_t n_bytes = ggml_nbytes(src0); - void* input_mul_scale_buffer = ctx.alloc_buffer(dst, n_bytes); + ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes); + void* input_mul_scale_buffer = mul_scale_allocator.get(); aclTensor* acl_input_mul_scale_tensor = create_acl_tensor( input_mul_scale_buffer, ACL_FLOAT, @@ -1717,12 +1759,12 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_MAX_DIMS); bool inplace = false; - aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace, - dst); + aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace); // mask aclTensor* acl_src1_fp32_tensor = nullptr; aclTensor* tmp_mask_tensor = nullptr; + ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool()); if (src1) { const bool use_f16 = src1->type == GGML_TYPE_F16; if (use_f16) { @@ -1733,15 +1775,16 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1]; } - void* src1_fp32_buffer = ctx.alloc_buffer(dst, n_bytes); - acl_src1_fp32_tensor = create_acl_tensor(src1_fp32_buffer, - ACL_FLOAT, + src1_fp32_allocator.alloc(n_bytes); + void* src1_fp32_buffer = src1_fp32_allocator.get(); + acl_src1_fp32_tensor = create_acl_tensor(src1_fp32_buffer, + ACL_FLOAT, sizeof(float), src1->ne, src1_fp32_nb, GGML_MAX_DIMS); aclTensor* acl_src1 = create_acl_tensor(src1); - aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT, dst); + aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT); ACL_CHECK(aclDestroyTensor(acl_src1)); } @@ -1768,7 +1811,8 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const size_t src_nb0 = src0->nb[0]; n_bytes = ggml_nbytes(dst); - 
void* output_buffer = ctx.alloc_buffer(dst, n_bytes); + ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes); + void* output_buffer = output_allocator.get(); aclTensor* alibi_output_tensor = create_acl_tensor( output_buffer, ACL_FLOAT, @@ -1779,11 +1823,11 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // slope = 1.0 if (tmp_mask_tensor) { aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor, - alibi_output_tensor, dst); + alibi_output_tensor); } else { aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor, - alibi_output_tensor, dst); + alibi_output_tensor); } } @@ -1802,11 +1846,11 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } // softmax - aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst, dst); + aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst); ACL_CHECK(aclDestroyTensor(alibi_output_tensor)); } else { - aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst, dst); + aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst); } ACL_CHECK(aclDestroyTensor(acl_src0)); @@ -1821,6 +1865,19 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src0 = dst->src[0]; ggml_tensor* src1 = dst->src[1]; + ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + src0->extra = src0_extra_allocator.get(); + src1->extra = src1_extra_allocator.get(); + dst->extra = dst_extra_allocator.get(); + ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + switch (src0->type) { case GGML_TYPE_F32: aclrtlaunch_ascendc_get_row_f32( @@ -1864,7 +1921,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst, int64_t dim, int64_t repeats, - int64_t output_size, ggml_tensor* bind_tensor) { + int64_t output_size) { // each elem in acl_src will repeat. repeat number is `repeats`, repeats dim // is `dim`. 
@@ -1878,7 +1935,8 @@ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, aclTensor* a &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -1900,15 +1958,20 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst, cube_math_type, &workspaceSize, &executor)); - if (workspaceSize > ctx.aclnn_workspace_size) { - aclrtFree(ctx.aclnn_buffer); - ACL_CHECK(aclrtMalloc(&ctx.aclnn_buffer, workspaceSize, - ACL_MEM_MALLOC_HUGE_FIRST)); - ctx.aclnn_workspace_size = workspaceSize; + // if (workspaceSize > ctx.aclnn_workspace_size) { + // aclrtFree(ctx.aclnn_buffer); + // ACL_CHECK(aclrtMalloc(&ctx.aclnn_buffer, workspaceSize, + // ACL_MEM_MALLOC_HUGE_FIRST)); + // ctx.aclnn_workspace_size = workspaceSize; + // } + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnMatmul(ctx.aclnn_buffer, workspaceSize, executor, + ACL_CHECK(aclnnMatmul(workspaceAddr, workspaceSize, executor, main_stream)); } @@ -1939,31 +2002,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* ds void* acl_repeat_weight_buffer = nullptr; aclTensor* acl_repeat_weight_tensor = nullptr; aclTensor* acl_weight_tensor =create_acl_tensor(src0); + ggml_cann_pool_alloc repeat_weight_allocator(ctx.pool()); + ggml_cann_pool_alloc repeat_weight2_allocator(ctx.pool()); if (repeat_dim2 > 1) { weight_repeat_ne[3] = src0->ne[3]; - acl_repeat_weight_buffer = ctx.alloc_buffer(dst, - ggml_nelements(src0) + repeat_weight_allocator.alloc( ggml_nelements(src0) *repeat_dim2 *ggml_type_size(src0->type)); - acl_repeat_weight_tensor = create_acl_tensor(acl_repeat_weight_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - weight_repeat_ne, - weight_repeat_nb, + acl_repeat_weight_buffer = repeat_weight_allocator.get(); + acl_repeat_weight_tensor = create_acl_tensor(acl_repeat_weight_buffer, + type_mapping(src0->type), + ggml_type_size(src0->type), + weight_repeat_ne, + weight_repeat_nb, GGML_MAX_DIMS); int64_t dim = 1; int64_t output_size = src0->ne[2]*repeat_dim2; aclnn_repeat_interleave(ctx, acl_weight_tensor, acl_repeat_weight_tensor, dim, repeat_dim2, - output_size, dst); + output_size); } if (repeat_dim3 > 1) { weight_repeat_ne[3] = src0->ne[3]*repeat_dim3; - acl_repeat_weight_buffer = ctx.alloc_buffer(dst, - ggml_nelements(src0) + repeat_weight2_allocator.alloc(ggml_nelements(src0) *repeat_dim2*repeat_dim3 *ggml_type_size(src0->type)); + acl_repeat_weight_buffer = repeat_weight2_allocator.get(); aclTensor* acl_repeat_weight_tensor2 = create_acl_tensor( acl_repeat_weight_buffer, type_mapping(src0->type), @@ -1976,12 +2041,12 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* ds if (acl_repeat_weight_tensor==nullptr) { aclnn_repeat_interleave(ctx, acl_weight_tensor, acl_repeat_weight_tensor2, dim, repeat_dim3, - output_size, dst); + output_size); } else { aclnn_repeat_interleave(ctx, acl_repeat_weight_tensor, acl_repeat_weight_tensor2, dim, repeat_dim3, - output_size, dst); + output_size); } ACL_CHECK(aclDestroyTensor(acl_repeat_weight_tensor)); ACL_CHECK(aclDestroyTensor(acl_repeat_weight_tensor2)); 
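The aclnn_mat_mul hunk above also retires the grow-only ctx.aclnn_buffer workspace cache (left commented out for now) in favor of the same per-call pool allocation, and ggml_cann_mat_mul_fp shows the deferred form of the helper: the allocator is bound to the pool alone and alloc() is called only once the repeat size is known. A minimal usage sketch of the two styles follows; sizes and the condition are placeholders.

// Illustrative usage only; sizes and the condition are placeholders.
static void pool_alloc_usage_example(ggml_backend_cann_context& ctx,
                                     size_t workspace_bytes,
                                     size_t repeat_bytes) {
    // 1) size known up front: allocate in the constructor
    ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspace_bytes);
    void* workspaceAddr = workspace_allocator.get();

    // 2) size known only later: bind the pool first, call alloc() on demand
    ggml_cann_pool_alloc repeat_weight_allocator(ctx.pool());
    void* repeat_buffer = nullptr;
    if (repeat_bytes > 0) {
        repeat_buffer = repeat_weight_allocator.alloc(repeat_bytes);
    }

    // ... wrap the buffers in aclTensor objects and enqueue kernels here ...
    (void)workspaceAddr;
    (void)repeat_buffer;

    // Both blocks are returned to the pool automatically when the allocators
    // leave scope.
}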
@@ -2045,8 +2110,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = create_acl_tensor(src1); - input_buffer = - ctx.alloc_buffer(dst, ggml_nelements(src1) * input_elem_size); + ggml_cann_pool_alloc input_allocator(ctx.pool(), ggml_nelements(src1) * input_elem_size); + input_buffer = input_allocator.get(); int64_t* input_cast_ne = src1->ne; size_t input_cast_nb[GGML_MAX_DIMS]; @@ -2058,7 +2123,7 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* aclTensor* acl_input_tensor = create_acl_tensor(input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS); - aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16, dst); + aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); ACL_CHECK(aclDestroyTensor(acl_input_tensor)); ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); } else { @@ -2069,8 +2134,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* size_t output_elem_size = sizeof(uint16_t); int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; - void* output_buffer = - ctx.alloc_buffer(dst, ggml_nelements(dst) * output_elem_size); + ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nelements(dst) * output_elem_size); + void* output_buffer = output_allocator.get(); size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; // aclnn @@ -2105,7 +2170,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* &workspaceSize, &executor)); if (workspaceSize > 0 && workspaceAddr == nullptr) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } ACL_CHECK(aclnnWeightQuantBatchMatmulV2( @@ -2130,7 +2196,7 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* create_acl_tensor(output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS); aclTensor* acl_dst_tensor = create_acl_tensor(dst); - aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT, dst); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT); ACL_CHECK(aclDestroyTensor(acl_output_tensor)); ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); @@ -2156,8 +2222,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* shifts, int64_t* dims, - ggml_tensor* bind_tensor) { + aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); aclIntArray* acl_dims = aclCreateIntArray(dims, 1); @@ -2169,7 +2234,8 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -2233,8 +2299,9 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // cast position: i32 to fp32 aclTensor* acl_position_tensor = create_acl_tensor(src1); - void* position_cast_buffer = ctx.alloc_buffer(dst, ggml_nelements(src1) +
ggml_cann_pool_alloc position_cast_allocator(ctx.pool(), ggml_nelements(src1) * sizeof(float_t)); + void* position_cast_buffer = position_cast_allocator.get(); int64_t* position_cast_ne = src1->ne; size_t position_cast_nb[GGML_MAX_DIMS]; position_cast_nb[0] = sizeof(float_t); @@ -2248,17 +2315,23 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { position_cast_nb, GGML_MAX_DIMS); - aclnn_cast(ctx, acl_position_tensor, acl_postion_cast_tensor, ACL_FLOAT, - dst); + aclnn_cast(ctx, acl_position_tensor, acl_postion_cast_tensor, ACL_FLOAT); - // init cos/sin cache - void* sin_buffer = ctx.alloc_buffer(dst, src0->ne[0] * src0->ne[2] + // init cos/sin cache, + ggml_cann_pool_alloc sin_allocator(ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); - void* cos_buffer = ctx.alloc_buffer(dst, src0->ne[0] * src0->ne[2] + ggml_cann_pool_alloc cos_allocator(ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); + void* sin_buffer = sin_allocator.get(); + void* cos_buffer = cos_allocator.get(); - aclrtlaunch_ascendc_rope_init_cache(param.position_ne[0], ctx.stream(), - position_cast_buffer, + ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + src0->extra = src0_extra_allocator.get(); + ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + + aclrtlaunch_ascendc_rope_init_cache(param.position_ne[0], ctx.stream(), + position_cast_buffer, sin_buffer, cos_buffer, param_buffer, ((ggml_tensor*)src0->extra)->ne); @@ -2287,14 +2360,18 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // roll input void* input_roll_buffer; aclTensor* acl_minus_one_tensor; + void* minus_one_scale_buffer = nullptr; + ggml_cann_pool_alloc minus_one_allocator(ctx.pool()); + ggml_cann_pool_alloc roll_allocator(ctx.pool()); if (is_glm) { // TODO GGML_ASSERT(false); } else if (!is_neox) { // roll input: [q0,q1,q2,...] -> [q1,q0,q3,q2...] - input_roll_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src0)); - int64_t input_roll_ne[4] = {2, src0->ne[1]*(src0->ne[0]/2), src0->ne[2], + roll_allocator.alloc(ggml_nbytes(src0)); + input_roll_buffer = roll_allocator.get(); + int64_t input_roll_ne[4] = {2, src0->ne[1]*(src0->ne[0]/2), src0->ne[2], src0->ne[3]}; size_t input_roll_nb[GGML_MAX_DIMS]; input_roll_nb[0] = ggml_type_size(src0->type); @@ -2318,14 +2395,12 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t shifts[] = {1}; int64_t dims[] = {3}; - aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims, - dst); + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); ACL_CHECK(aclDestroyTensor(acl_input_tensor)); // init [-1, 1, -1, 1, ...] - void* minus_one_scale_buffer = ctx.alloc_buffer(dst, sizeof(int64_t) - * src0->ne[0]); + ACL_CHECK(aclrtMalloc(&minus_one_scale_buffer, sizeof(int64_t) * src0->ne[0], ACL_MEM_MALLOC_HUGE_FIRST)); int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; size_t minus_one_nb[GGML_MAX_DIMS]; minus_one_nb[0] = sizeof(int64_t); @@ -2349,7 +2424,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } else { // roll input: [q0,q1,q2,...] 
-> [q_half,q_half+1,..., q0,q1,...q_half-1] - input_roll_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src0)); + roll_allocator.alloc(ggml_nbytes(src0)); + input_roll_buffer = roll_allocator.get(); aclTensor* acl_input_roll_tensor = create_acl_tensor( input_roll_buffer, type_mapping(src0->type), @@ -2360,14 +2436,12 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t shifts[] = {src0->ne[0] / 2}; int64_t dims[] = {3}; - aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims, - dst); + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); ACL_CHECK(aclDestroyTensor(acl_input_tensor)); // init [-1, -1, -1, 1, 1,1,...] - void* minus_one_scale_buffer = ctx.alloc_buffer(dst, sizeof(int64_t) - * src0->ne[0]); + ACL_CHECK(aclrtMalloc(&minus_one_scale_buffer, sizeof(int64_t) * src0->ne[0], ACL_MEM_MALLOC_HUGE_FIRST)); int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; size_t minus_one_nb[GGML_MAX_DIMS]; minus_one_nb[0] = sizeof(int64_t); @@ -2395,8 +2469,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } // input * scale - void* input_roll_mul_scale_buffer = ctx.alloc_buffer(dst, - ggml_nbytes(src0)); + ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0)); + void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); size_t input_nb[GGML_MAX_DIMS]; input_nb[0] = ggml_type_size(src0->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { @@ -2414,19 +2488,18 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); - aclnn_noinplcace_mul(ctx, acl_input_roll_reshape_tensor, - acl_minus_one_tensor, acl_input_roll_mul_scale_tensor, - dst); - + aclnn_noinplcace_mul(ctx, acl_input_roll_reshape_tensor, + acl_minus_one_tensor, acl_input_roll_mul_scale_tensor); + // output aclTensor* acl_src0 = create_acl_tensor(src0); aclTensor* acl_dst = create_acl_tensor(dst); void* output_fp32_buffer; if (src0->type == GGML_TYPE_F32) { - aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor, dst); + aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor); aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, - acl_sin_reshape_tensor, dst); - aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst, dst); + acl_sin_reshape_tensor); + aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst); // TODO: zeta scaling for xPos // TODO: ne0 != n_dims in mode2 } @@ -2436,39 +2509,38 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; } - void* input_fp32_buffer1 = ctx.alloc_buffer(dst, ggml_nelements(dst) - * sizeof(float_t)); - aclTensor* input_fp32_tensor1 = create_acl_tensor(input_fp32_buffer1, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, + ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_fp32_buffer1 = fp32_allocator1.get(); + aclTensor* input_fp32_tensor1 = create_acl_tensor(input_fp32_buffer1, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, GGML_MAX_DIMS); - void* input_fp32_buffer2 = ctx.alloc_buffer(dst, ggml_nelements(dst) - * sizeof(float_t)); - aclTensor* input_fp32_tensor2 = create_acl_tensor(input_fp32_buffer2, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, + ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), ggml_nelements(dst) * 
sizeof(float_t)); + void* input_fp32_buffer2 = fp32_allocator2.get(); + aclTensor* input_fp32_tensor2 = create_acl_tensor(input_fp32_buffer2, + ACL_FLOAT, + sizeof(float_t), + dst->ne, + input_fp32_nb, GGML_MAX_DIMS); - output_fp32_buffer = ctx.alloc_buffer(dst, ggml_nelements(dst) - * sizeof(float_t)); - aclTensor* output_fp32_tensor = create_acl_tensor(output_fp32_buffer, + ggml_cann_pool_alloc fp32_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + output_fp32_buffer = fp32_allocator.get(); + aclTensor* output_fp32_tensor = create_acl_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne, input_fp32_nb, GGML_MAX_DIMS); aclnn_noinplcace_mul(ctx, acl_src0, acl_cos_reshape_tensor, - input_fp32_tensor1, dst); + input_fp32_tensor1); aclnn_noinplcace_mul(ctx, acl_input_roll_mul_scale_tensor, - acl_sin_reshape_tensor, input_fp32_tensor2, - dst); + acl_sin_reshape_tensor, input_fp32_tensor2); aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, - output_fp32_tensor, dst); - aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16, dst); + output_fp32_tensor); + aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); @@ -2484,4 +2556,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_src0)); ACL_CHECK(aclDestroyTensor(acl_dst)); + + if (minus_one_scale_buffer != nullptr) + ACL_CHECK(aclrtFree(minus_one_scale_buffer)); } diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 469c89336418c8..6a05e3fedb1eb1 100644 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -108,7 +108,8 @@ void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -139,7 +140,8 @@ void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); @@ -169,7 +171,8 @@ void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(dst, workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } aclrtStream main_stream = ctx.stream(); diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 810a0504bdb467..174d5ac14bbb3f 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "../include/ggml-cann.h" #include "../include/ggml.h" @@ -49,6 +50,56 @@ const ggml_cann_device_info& ggml_cann_info(); void ggml_cann_set_device(int32_t device); int32_t ggml_cann_get_device(); +struct ggml_cann_pool { + virtual ~ggml_cann_pool() = default; + + virtual void * alloc(size_t 
size, size_t * actual_size) = 0; + virtual void free(void * ptr, size_t size) = 0; +}; + +struct ggml_cann_pool_alloc { + ggml_cann_pool * pool = nullptr; + void * ptr = nullptr; + size_t actual_size = 0; + + ggml_cann_pool_alloc() = default; + + explicit ggml_cann_pool_alloc(ggml_cann_pool & pool) : pool(&pool) { + } + + ggml_cann_pool_alloc(ggml_cann_pool & pool, size_t size) : pool(&pool) { + alloc(size); + } + + ~ggml_cann_pool_alloc() { + if (ptr != nullptr) { + pool->free(ptr, actual_size); + } + } + + // size is in number of elements + void * alloc(size_t size) { + GGML_ASSERT(pool != nullptr); + GGML_ASSERT(ptr == nullptr); + ptr = pool->alloc(size, &this->actual_size); + return ptr; + } + + void * alloc(ggml_cann_pool & pool, size_t size) { + this->pool = &pool; + return alloc(size); + } + + void * get() { + return ptr; + } + + ggml_cann_pool_alloc(const ggml_cann_pool_alloc &) = delete; + ggml_cann_pool_alloc(ggml_cann_pool_alloc &&) = delete; + ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc &) = delete; + ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc &&) = delete; +}; + struct ggml_backend_cann_context { int32_t device; std::string name; @@ -59,9 +110,6 @@ struct ggml_backend_cann_context { aclrtStream streams[GGML_CANN_MAX_STREAMS] = {{nullptr}}; - // bind temp buffers to stream. Free after sync. - std::multimap buffers[GGML_CANN_MAX_STREAMS]; - explicit ggml_backend_cann_context(int device) : device(device), name(GGML_CANN_NAME + std::to_string(device)) {} @@ -72,57 +120,11 @@ struct ggml_backend_cann_context { for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) { if (streams[i] != nullptr) { ACL_CHECK(aclrtDestroyStream(streams[i])); - // Buffers should have been freed. - GGML_ASSERT(buffers[i].size() == 0); } } aclrtFree(aclnn_buffer); } - void* alloc_buffer(ggml_tensor* dst, size_t size, int stream) { - void* buffer; - ACL_CHECK(aclrtMalloc(&buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); - bind_buffer(dst, buffer, stream); - return buffer; - } - - void* alloc_buffer(ggml_tensor* dst, size_t size) { - return alloc_buffer(dst, size, 0); - } - - // Free all buffers bind to all streams. - void free_device_buffers() { - for (int i = 0; i < GGML_CANN_MAX_STREAMS; i++) { - for (auto& it : buffers[i]) { - ACL_CHECK(aclrtFree(it.second)); - } - buffers[i].clear(); - } - } - - // Free all buffers bind to stream. - void free_stream_buffers(int stream) { - for (auto& it : buffers[stream]) { - ACL_CHECK(aclrtFree(it.second)); - } - buffers[stream].clear(); - } - - // Free all buffers belong to dst. - // Remove it from stream buffers to avoid double free. - void free_tensor_buffers(ggml_tensor* dst) { - // ggml_tensor.extra means which stream are tensor in. - for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) { - if (streams[i] != nullptr) { - for (auto pos = buffers[i].equal_range(dst); - pos.first != pos.second; ++pos.first) { - ACL_CHECK(aclrtFree(pos.first->second)); - } - buffers[i].erase(dst); - } - } - } - aclrtStream stream(int stream) { if (streams[stream] == nullptr) { ggml_cann_set_device(device); @@ -131,15 +133,18 @@ struct ggml_backend_cann_context { return streams[stream]; } - // All temp buffers should bind to stream and the dst tensor. - // It will be free if: - // 1. dst tensor are no longer used any more. - // 2. after stream sync. 
- void bind_buffer(ggml_tensor* dst, void* buf, int stream) { - buffers[stream].insert(std::make_pair(dst, buf)); - } - aclrtStream stream() { return stream(0); } + + std::unique_ptr<ggml_cann_pool> mem_pool; + + static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device); + + ggml_cann_pool & pool() { + if (mem_pool == nullptr) { + mem_pool = new_pool_for_device(device); + } + return *mem_pool; + } }; #endif // CANN_COMMON_H diff --git a/src/llama.cpp b/src/llama.cpp index f49a4e186c3dca..96395409aa9a4b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -14821,6 +14821,8 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_graph_compute(lctx, gf, lctx.cparams.n_threads); need_reserve = true; + + LLAMA_LOG_INFO("\n\n\n\nkv cache updated!!!!!\n\n\n\n"); } {