fix get row with batch and view
hipudding committed May 13, 2024
1 parent 5430a30 commit 4e7889e
Showing 7 changed files with 200 additions and 146 deletions.
87 changes: 56 additions & 31 deletions ggml-cann.cpp
@@ -69,14 +69,26 @@ struct ggml_backend_cann_buffer_context {
     int32_t device;
     void* dev_ptr = nullptr;
     std::string name;
-    std::vector<void*> tensor_transform_buffers;
+    std::vector<void*> dev_extra_ptrs;
 
     ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
         : device(device),
          dev_ptr(dev_ptr),
          name(GGML_CANN_NAME + std::to_string(device)) {}
 
-    ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
+    void* get_extra_ptr(size_t size) {
+        void *buffer;
+        ACL_CHECK(aclrtMalloc(&buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        dev_extra_ptrs.push_back(buffer);
+        return buffer;
+    }
+
+    ~ggml_backend_cann_buffer_context() {
+        ACL_CHECK(aclrtFree(dev_ptr));
+        for (auto dev_extra_ptr : dev_extra_ptrs) {
+            ACL_CHECK(aclrtFree(dev_extra_ptr));
+        }
+    }
 };
 
 GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
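
The new dev_extra_ptrs vector gives the buffer context ownership of every device-side metadata allocation: get_extra_ptr() records each aclrtMalloc result, and the destructor frees them all together with the main buffer, so callers never free the returned pointer themselves. A minimal compilable sketch of this ownership pattern, with malloc/free standing in for aclrtMalloc/aclrtFree so it builds without the ACL runtime:

    #include <cstddef>
    #include <cstdlib>
    #include <vector>

    // Sketch: every pointer handed out by get_extra_ptr() is recorded and
    // released when the owner is destroyed, mirroring dev_extra_ptrs above.
    struct extra_ptr_owner {
        std::vector<void*> dev_extra_ptrs;

        void* get_extra_ptr(std::size_t size) {
            void* buffer = std::malloc(size);  // aclrtMalloc in the real backend
            dev_extra_ptrs.push_back(buffer);
            return buffer;
        }

        ~extra_ptr_owner() {
            for (void* p : dev_extra_ptrs) {
                std::free(p);                  // aclrtFree in the real backend
            }
        }
    };
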
@@ -105,33 +117,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
     return ctx->dev_ptr;
 }
 
-GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
-    ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-    tensor->backend = GGML_BACKEND_TYPE_GPU;
-
-    // TODO: can backend doesn't support quantized yet. Just leave the code
-    // here.
-    if (ggml_is_quantized(tensor->type)) {
-        // Initialize padding to 0 to avoid possible NaN values
-        size_t original_size = ggml_nbytes(tensor);
-        size_t padded_size =
-            ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            size_t memset_size = padded_size - original_size;
-            ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
-                                  memset_size, 0, memset_size));
-        }
-    }
-}
-
 GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, const void *src, void* dst) {
     GGML_ASSERT(tensor->extra == nullptr);
     GGML_ASSERT(tensor->op == GGML_OP_NONE);
@@ -199,7 +184,6 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor* t
 }
 
 GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, const void *src, void* dst) {
-    GGML_ASSERT(tensor->extra == nullptr);
     GGML_ASSERT(tensor->op == GGML_OP_NONE);
 
     size_t n_bytes = ggml_nbytes(tensor);
@@ -221,7 +205,6 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, cons
 }
 
 GGML_CALL static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor* tensor, const void *src, void* dst) {
-    GGML_ASSERT(tensor->extra == nullptr);
     GGML_ASSERT(tensor->op == GGML_OP_NONE);
 
     size_t n_bytes = ggml_nbytes(tensor);
@@ -279,6 +262,48 @@ GGML_CALL static bool need_transform(ggml_type type) {
     }
 }
 
+static void set_tensor_extra(ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
+    // If the tensor needs transform, make sure all metadata is copied to
+    // the NPU.
+    // TODO: All tensors should copy metadata to the NPU, but extra is used
+    // to record memory usage. Only used for perf test.
+    size_t tensor_meta_size = sizeof(ggml_tensor);
+    ggml_backend_cann_buffer_context* ctx =
+        (ggml_backend_cann_buffer_context*)buffer->context;
+    tensor->extra = ctx->get_extra_ptr(tensor_meta_size);
+    ACL_CHECK(aclrtMemcpy(tensor->extra, tensor_meta_size, tensor,
+                          tensor_meta_size, ACL_MEMCPY_HOST_TO_DEVICE));
+}
+
+GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
+    ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+        tensor->backend = tensor->view_src->backend;
+        set_tensor_extra(buffer, tensor);
+        return;
+    }
+
+    tensor->backend = GGML_BACKEND_TYPE_GPU;
+
+    // TODO: cann backend doesn't support quantized tensors yet. Just leave
+    // the code here.
+    if (ggml_is_quantized(tensor->type)) {
+        // Initialize padding to 0 to avoid possible NaN values
+        size_t original_size = ggml_nbytes(tensor);
+        size_t padded_size =
+            ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            size_t memset_size = padded_size - original_size;
+            ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
+                                  memset_size, 0, memset_size));
+        }
+    }
+    set_tensor_extra(buffer, tensor);
+}
+
+// TODO: need to handle tensors which have paddings.
 GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
     ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data,
     size_t offset, size_t size) {
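
This is the core of the view fix: previously a view tensor inherited extra from its view_src, so a kernel reading shapes through extra saw the source's ne/nb instead of the view's. set_tensor_extra() now snapshots each tensor's own sizeof(ggml_tensor) bytes of metadata into NPU memory at init time. A runnable sketch of the aliasing problem, using a simplified stand-in struct and hypothetical shape values (not taken from the commit):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Simplified stand-in for ggml_tensor's shape metadata.
    struct tensor_meta { int64_t ne[4]; };

    int main() {
        tensor_meta src  = {{4096, 32, 1, 1}};  // full tensor
        tensor_meta view = {{4096,  1, 1, 1}};  // view of one row, data shared

        // Old behavior: the view reused src's device-side metadata, so a
        // kernel indexing by ne[1] would iterate 32 rows instead of 1.
        assert(src.ne[1] != view.ne[1]);

        // New behavior: each tensor gets its own metadata snapshot.
        tensor_meta dev_copy;                         // device buffer in the backend
        std::memcpy(&dev_copy, &view, sizeof(view));  // aclrtMemcpy there
        assert(dev_copy.ne[1] == 1);
        return 0;
    }
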
22 changes: 6 additions & 16 deletions ggml-cann/aclnn_ops.cpp
@@ -1610,22 +1610,12 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];
     ggml_tensor* src1 = dst->src[1];
 
-    get_row_param param;
-    param.indices_ne[0] = src1->ne[0];
-    param.indices_ne[1] = src1->ne[1];
-    param.input_ne[0] = src0->ne[0];
-    param.input_ne[1] = src0->ne[1];
-    param.input_ne[2] = src0->ne[2];
-
-    void *buffer;
-    ACL_CHECK(aclrtMalloc(&buffer, sizeof(get_row_param), ACL_MEM_MALLOC_HUGE_FIRST));
-
-    ACL_CHECK(aclrtMemcpy(buffer, sizeof(get_row_param), &param, sizeof(get_row_param), ACL_MEMCPY_HOST_TO_DEVICE));
-
-    aclrtlaunch_ascendc_get_row_q8_0(1, ctx.stream(), src0->data, src1->data, dst->data, buffer);
-
-    ACL_CHECK(aclrtFree(buffer));
-
+    aclrtlaunch_ascendc_get_row_q8_0(1, ctx.stream(), src0->data, src1->data,
+                                     dst->data, ((ggml_tensor*)src0->extra)->ne,
+                                     ((ggml_tensor*)src1->extra)->ne,
+                                     ((ggml_tensor*)src1->extra)->nb,
+                                     ((ggml_tensor*)dst->extra)->ne,
+                                     ((ggml_tensor*)dst->extra)->nb);
 }
 
 void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
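
The launch now passes device-resident ne/nb addresses derived from each tensor's extra, instead of staging a get_row_param struct through a malloc/memcpy/free round trip on every call. Note that ((ggml_tensor*)src0->extra)->ne never dereferences device memory on the host: taking the address of an array member is pure pointer arithmetic. A self-contained sketch with a simplified layout (the real offsets come from ggml_tensor in ggml.h):

    #include <cstddef>
    #include <cstdint>

    // Simplified stand-in layout; the real ggml_tensor is defined in ggml.h.
    struct tensor_like {
        int64_t     ne[4];  // elements per dimension
        std::size_t nb[4];  // stride in bytes per dimension
    };

    // Equivalent to ((tensor_like*)extra)->ne with the arithmetic made
    // explicit: no read of *extra happens, only an offset computation.
    inline void* device_ne(void* extra) {
        return static_cast<char*>(extra) + offsetof(tensor_like, ne);
    }

    inline void* device_nb(void* extra) {
        return static_cast<char*>(extra) + offsetof(tensor_like, nb);
    }
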
16 changes: 7 additions & 9 deletions ggml-cann/common.h
@@ -55,7 +55,6 @@ struct ggml_backend_cann_context {
     aclrtEvent copy_event = nullptr;
 
     aclrtStream streams[GGML_CANN_MAX_STREAMS] = {{nullptr}};
-    int stream_ids[GGML_CANN_MAX_STREAMS] = {0};
 
     // bind temp buffers to stream. Free after sync.
     std::multimap<ggml_tensor*, void*> buffers[GGML_CANN_MAX_STREAMS];
@@ -109,20 +108,20 @@ struct ggml_backend_cann_context {
     // Remove it from stream buffers to avoid double free.
     void free_tensor_buffers(ggml_tensor* dst) {
         // ggml_tensor.extra means which stream are tensor in.
-        if (dst->extra != nullptr) {
-            int stream = *((int*)dst->extra);
-            for (auto pos = buffers[stream].equal_range(dst); pos.first != pos.second;
-                 ++pos.first) {
-                ACL_CHECK(aclrtFree(pos.first->second));
+        for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
+            if (streams[i] != nullptr) {
+                for (auto pos = buffers[i].equal_range(dst);
+                     pos.first != pos.second; ++pos.first) {
+                    ACL_CHECK(aclrtFree(pos.first->second));
+                }
+                buffers[i].erase(dst);
             }
-            buffers[stream].erase(dst);
         }
     }
 
     aclrtStream stream(int stream) {
         if (streams[stream] == nullptr) {
             ggml_cann_set_device(device);
-            stream_ids[stream] = stream;
             ACL_CHECK(aclrtCreateStream(&streams[stream]));
         }
         return streams[stream];
@@ -134,7 +133,6 @@ struct ggml_backend_cann_context {
     // 2. after stream sync.
     void bind_buffer(ggml_tensor* dst, void* buf, int stream) {
         buffers[stream].insert(std::make_pair(dst, buf));
-        dst->extra = &(stream_ids[stream]);
     }
 
     aclrtStream stream() { return stream(0); }
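
With stream_ids gone, extra no longer encodes which stream owns a tensor's temp buffers (it now holds the device-side metadata copy), so free_tensor_buffers has to scan every created stream instead of jumping straight to one. A minimal compilable sketch of that scan, with stand-in types and free in place of aclrtFree:

    #include <cstdlib>
    #include <map>

    constexpr int MAX_STREAMS = 8;  // stand-in for GGML_CANN_MAX_STREAMS

    std::multimap<const void*, void*> buffers[MAX_STREAMS];
    bool stream_created[MAX_STREAMS] = {};

    void free_tensor_buffers(const void* dst) {
        for (int i = 0; i < MAX_STREAMS; ++i) {
            if (!stream_created[i]) continue;
            auto range = buffers[i].equal_range(dst);
            for (auto it = range.first; it != range.second; ++it) {
                std::free(it->second);  // aclrtFree in the real backend
            }
            buffers[i].erase(dst);
        }
    }
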
4 changes: 3 additions & 1 deletion ggml-cann/kernels/CMakeLists.txt
@@ -21,4 +21,6 @@ include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 
 ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
-)
+)
+
+#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
(diffs for the remaining 3 changed files are not shown)
