diff --git a/README-qnn.md b/README-qnn.md
new file mode 100644
index 00000000000000..ce01d6b78fdf14
--- /dev/null
+++ b/README-qnn.md
@@ -0,0 +1,130 @@
# llama.cpp for QNN

- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Android](#android)
- [Windows](#windows)
- [Q&A](#qa)
- [TODO](#todo)

## Background

Android maintained its position as the leading mobile operating system worldwide in the fourth quarter of 2023 with a market share of 70.1 percent, and Qualcomm is currently the No. 1 mobile SoC semiconductor company.

**QNN** (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) is an SDK that is verified to work with the following versions of the ML frameworks:

The Qualcomm® AI Engine Direct architecture is designed to be modular and allows for clean separation in the software for different hardware cores/accelerators such as the CPU, GPU and DSP, which are designated as backends. Learn more about Qualcomm® AI Engine Direct backends here.

![Screenshot from 2024-04-14 11-42-14](https://github.com/zhouwg/kantv/assets/6889919/5d8de93a-7b02-4d6b-8b7f-19d2f829dd4d)

The Qualcomm® AI Engine Direct backends for different hardware cores/accelerators are compiled into individual core-specific libraries that come packaged with the SDK.

One of the key highlights of Qualcomm® AI Engine Direct is that it provides a unified API to delegate operations such as graph creation and execution across all hardware accelerator backends. This allows users to treat Qualcomm® AI Engine Direct as a hardware abstraction API and port applications easily to different cores.

The Qualcomm® AI Engine Direct API is designed to support an efficient execution model, with capabilities such as graph optimization handled internally. At the same time, it leaves broader functionality such as model parsing and network partitioning to higher-level frameworks.

The Qualcomm® AI Engine Direct API and the associated software stack provide all the constructs required by an application to construct, optimize and execute network models on the desired hardware accelerator core. Key constructs are illustrated by the Qualcomm AI Engine Direct Components - High Level View diagram.

![qnn-arch](https://github.com/zhouwg/kantv/assets/6889919/4f4881a6-9a91-4477-aeb2-193591375d75)

### Llama.cpp + QNN

The llama.cpp QNN backend is intended to support **Qualcomm mobile SoCs** first.

## News

- 2024.4.24
  - PR submitted to the ggml community
  - data path works as expected with whisper.cpp and llama.cpp using the QNN backend, verified on both low-end and high-end Android phones based on Qualcomm mobile SoCs
  - Supported OPs
    - GGML_OP_ADD
    - GGML_OP_MUL
    - GGML_OP_MUL_MAT

- 2024.3.29
  - launched "PoC: add QNN backend for Qualcomm mobile SoC"

## OS

| OS             | Status  | Verified               |
|----------------|---------|------------------------|
| Android        | Support | Android 10, Android 14 |
| Windows on ARM | TBD     | TBD                    |

## Hardware

### Qualcomm mobile SoC based Android phone

**Verified devices**

| Qualcomm mobile SoC                   | Status  | Verified Vendor |
|---------------------------------------|---------|-----------------|
| Qualcomm SM8650-AB Snapdragon 8 Gen 3 | Support | Xiaomi 14       |
| Qualcomm low-end mobile SoC series    | Support | Vivo            |

### Qualcomm SoC based Windows

TBD

## Android

### I. Setup Environment

Any **mainstream** Android phone based on a Qualcomm mobile SoC should be supported by llama.cpp + QNN. A Qualcomm SM8650-AB Snapdragon 8 Gen 3 based Android phone is preferred.

### II. Build llama.cpp + QNN backend

Please refer to [project kantv](https://github.com/zhouwg/kantv) first.

A small and standalone Android example (or re-use of [the existing Android example in llama.cpp](https://github.com/ggerganov/llama.cpp/tree/master/examples/llama.android)) is planned to make it easier for community developers to participate in developing and verifying the QNN backend; a minimal sketch of the native API usage is shown below.

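For reference, here is a minimal sketch (not part of this PR) of what the native side of such a standalone example could look like. It only uses the public API declared in `ggml-qnn.h` and `ggml-backend.h`; the device index convention (0: QNN_CPU, 1: QNN_GPU, 2: QNN_HTP aka DSP) and the example data path `/data/data/com.ggml.llamacpp/` are taken from this PR, while the rest (file layout, error handling) is illustrative only.

```cpp
// Hypothetical standalone snippet, not part of this PR: create a QNN backend,
// print its name, then release it. Error handling is kept minimal on purpose.
#include <cstdio>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"

int main() {
    // 0: QNN_CPU, 1: QNN_GPU, 2: QNN_HTP (aka DSP)
    const size_t device = 0;

    // on Android the second argument is the app's data directory, obtained via JNI
    ggml_backend_t backend = ggml_backend_qnn_init(device, "/data/data/com.ggml.llamacpp/");
    if (backend == nullptr) {
        printf("failed to initialize QNN backend %zu\n", device);
        return 1;
    }

    printf("QNN backend created: %s\n", ggml_backend_name(backend));

    ggml_backend_free(backend);
    return 0;
}
```

In llama.cpp itself, this PR passes `model->main_gpu` as the device index (see the llama.cpp hunk at the end of this diff), so the same three device values apply there as well.
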
### III. Run the inference on Qualcomm mobile SoC based Android phone

![504893116](https://github.com/zhouwg/kantv/assets/6889919/51f0b277-eca4-4938-86f5-415dbf5897e7)

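As an illustration of how the QNN backend can be driven from the generic ggml backend interface, the sketch below (not part of this PR) builds a tiny graph containing the three currently supported OPs (GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT) and computes it on the backend. Apart from `ggml_backend_qnn_init`, every call is standard ggml API; the matrix sizes, values and device index are arbitrary choices for the example.

```cpp
// Hypothetical test harness, not part of this PR: exercise the three OPs
// currently handled by the QNN backend through the generic backend interface.
#include <cstdio>
#include <vector>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"

int main() {
    // device 2 = QNN_HTP (aka DSP); use 0 (QNN_CPU) or 1 (QNN_GPU) as needed
    ggml_backend_t backend = ggml_backend_qnn_init(2, "/data/data/com.ggml.llamacpp/");
    if (backend == nullptr) {
        printf("failed to initialize QNN backend\n");
        return 1;
    }

    // tensors are allocated in a backend buffer later, so no_alloc = true
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 16 + ggml_graph_overhead(),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n = 4; // 4x4 matrices, FP32 only (see the TODO list above)
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);

    // c = (a x b) + (a * b) -> exercises GGML_OP_MUL_MAT, GGML_OP_MUL, GGML_OP_ADD
    struct ggml_tensor * c = ggml_add(ctx, ggml_mul_mat(ctx, a, b), ggml_mul(ctx, a, b));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // allocate every tensor of the graph in a QNN backend buffer
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    std::vector<float> ones(n * n, 1.0f);
    ggml_backend_tensor_set(a, ones.data(), 0, ggml_nbytes(a));
    ggml_backend_tensor_set(b, ones.data(), 0, ggml_nbytes(b));

    if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
        printf("graph compute failed\n");
    } else {
        std::vector<float> result(n * n);
        ggml_backend_tensor_get(c, result.data(), 0, ggml_nbytes(c));
        printf("c[0] = %f (expected 5.0 for all-ones inputs)\n", result[0]);
    }

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}
```

The only QNN-specific call here is `ggml_backend_qnn_init`; everything else is the regular ggml backend interface, so the same sketch can be pointed at any other backend for comparison.
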
## Windows

TBD

## Q&A

TBD

### GitHub contribution

Please add the **[ggml-qnn]** prefix/tag in issue/PR titles to help the community check/address them without delay.

## TODO

- only FP32 / FP16 are supported, and the input and output tensors must be of the same data type

- lack of [implementation of other GGML OPs using the QNN API](https://github.com/zhouwg/llama.cpp/blob/qualcomm_qnn_backend_for_ggml/ggml-qnn.cpp#L3452); this work is very similar to GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MUL_MAT in ggml-qnn.cpp

- multithreading does not work with the QNN GPU and HTP (aka DSP) backends

- QNN's RPC feature (which is useful for the QNN HTP (aka DSP) backend) is not used

- multiple QNN backends (CPU/GPU/DSP) cannot be used simultaneously
diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
index 5d698f184c25df..36b2abf5b2eb53 100644
--- a/ggml-qnn.cpp
+++ b/ggml-qnn.cpp
@@ -1,33 +1,3 @@
-/*
- * MIT license
- * Copyright (C) 2024 GGML Authors
- * SPDX-License-Identifier: MIT
- *
- * this is implementation of ggml QNN(Qualcomm Neural Network, aka AI Engine Direct) backend
- *
- * status:
- *
- * 1. core implementation(data path works fine as expected with whisper.cpp using QNN CPU/GPU backend on Qualcomm's SoC based low-end phone
- *
- * 2. core implementation(data path works fine as expected with whisper.cpp using QNN HTP(aka DSP) backend on Qualcomm's soC based high-end phone
- *
- * 3. core implementation(data path works fine as expected with llama.cpp using QNN CPU/GPU/HTP(aka DSP) backend on Qualcomm's soC based high-end phone
- *
- * 4. GGML_OP_MUL_MAT & GGML_OP_MUL & GGML_OP_ADD using QNN API has been completed
- *
- * todo:
- *
- * 1. lack of implementation of other GGML-OPs using QNN API
- *
- * 2. only support FP32 / FP16 and the input and output tensors must be of the same data type
- *
- * 3. QNN's RPC feature(which useful for QNN HTP(aka DSP) backend) not used
- *
- * 4. multi QNN backend(CPU/GPU/DSP) simultaneously not support
- *
- * 5. multithreading not work with QNN GPU/HTP(aka DSP) backend
- *
- */
 #include
 #include
 #include
@@ -89,6 +59,19 @@ class qnn_instance;

 //TODO: should be removed because this is a workaround method during development stage
+//a minor modification is required during the development stage to validate the QNN backend on an Android phone:
+//
+//modify from
+//
+//static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor)
+//
+//to
+//
+//void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor)
+//
+//in source file ggml.c#L16156
+//
+//this workaround will not be needed when the final QNN backend is complete
 extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

 #if (defined __ANDROID__) || (defined ANDROID)
 //Qualcomm's QNN could running on Windows over ARM(aka WoA)
@@ -838,7 +821,7 @@ static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle

 static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) {
-    if (!dst || !src || !dstSize || !copySize)
+    if (!dst || !src || !dstSize || !copySize)
         return 0;

     size_t minSize = dstSize < copySize ? dstSize : copySize;
@@ -946,7 +929,7 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) {
         QNN_LOG_INFO("it should not happen, pls check");
     } else {
         //TODO:why crash in here? why pointer changed with mul_mat?
-        //memory leak after comment above line
+        //memory leak if the line below stays commented out
         //free(QNN_TENSOR_GET_DIMENSIONS(tensor));
     }
@@ -1043,7 +1026,7 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
 }

-//TODO:
+//TODO: only GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MUL_MAT are supported
 static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) {
     switch (ggmlop) {
         case GGML_OP_ADD:
@@ -1204,16 +1187,10 @@ static buf_element_t * qnn_buf_buffer_get (qnn_buf_t * fifo) {
     buf_element_t * buf = nullptr;

     pthread_mutex_lock (&fifo->mutex);
-#if 0
-    while (fifo->first == nullptr) {
-        pthread_cond_wait (&fifo->not_empty, &fifo->mutex);
-    }
-#else
     if (fifo->first == nullptr) {
         pthread_mutex_unlock (&fifo->mutex);
         return nullptr;
     }
-#endif

     buf = fifo->first;
@@ -1449,9 +1426,9 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
     int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
     if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
 #if (defined __ANDROID__) || (defined ANDROID)
-        __android_log_print(level, "llamacpp", "%s", s_ggml_qnn_log_internal_buf);
+        __android_log_print(level, "ggml-qnn", "%s", s_ggml_qnn_log_internal_buf);
 #else
-        printf("%s", buffer); //Qualcomm's QNN could running on Window over ARM
+        printf("%s", buffer); //Qualcomm's QNN can run on Windows on ARM (aka WoA)
 #endif
     }
     va_end(args);
@@ -2095,11 +2072,11 @@ int qnn_instance::load_system() {

     _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
     if (nullptr == _system_lib_handle) {
-        QNN_LOG_WARN("can not pen QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror());
+        QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror());
         return 1;
     }

-    auto *get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym(
+    auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym(
         _system_lib_handle, "QnnSystemInterface_getProviders"));
     if (nullptr == get_providers) {
         QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror());
@@ -2223,7 +2200,7 @@ static void ggml_qnn_logcallback(const char * fmt,
         int len_content = 0;
         memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN);
         len_content = vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp);
-        //QNN_LOG_DEBUG("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf);
+        QNN_LOG_DEBUG("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf);
     }
 }
@@ -2303,15 +2280,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
         QNN_LOG_INFO("create device successfully\n");
     }

-    /*
-    std::vector temp_device_config;
-    _qnn_interface.qnn_device_create(_qnn_log_handle, temp_device_config.empty() ? nullptr : temp_device_config.data(), &_qnn_device_handle);
-    if (nullptr == _qnn_device_handle) {
-        QNN_LOG_WARN("why failed to initialize qnn device\n");
-        //return 6;
-    }
-    */
-
     if (ggml_qnn_profile_level::profile_off != _profile_level) {
         QNN_LOG_INFO("profiling turned on; level = %d", _profile_level);
         if (ggml_qnn_profile_level::profile_basic == _profile_level) {
@@ -2377,7 +2345,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
 }

-//QNN SDK would/might/should release all allocated resource in SDK's internal
+//the QNN SDK would/might/should release all internally allocated QNN resources
 int qnn_instance::qnn_finalize() {
     int ret_status = 0;
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -3592,7 +3560,6 @@ bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_t
     }

-    //ok, real show time in Qualcomm's QNN internal
     if (nullptr != func)
         func(tensor->src[0], tensor->src[1], tensor);
     if (nullptr != func_common)
@@ -3832,7 +3799,7 @@ static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_typ

 static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     GGML_UNUSED(buft);

-    return (38 * 1024 * 1024);
+    return (96 * 1024 * 1024);
 }
@@ -4429,6 +4396,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
 }

+#if 0 //replaced by ggml_status ggml_backend_qnn_graph_compute_multithread
 static void * ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@@ -4563,6 +4531,7 @@ static void * ggml_graph_compute_thread(void * data) {

     return 0;
 }
+#endif

 static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t backend, ggml_cgraph * cgraph) {
@@ -4579,6 +4548,7 @@ static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t bac

     if (plan.work_size > 0) {
         //QNN_LOG_INFO("work size %d(%d MB)", plan.work_size, plan.work_size / (1 << 20));
+        //TODO: use a memory pool to avoid dynamic memory allocation/free
         plan.work_data = static_cast(malloc(plan.work_size));
         if (plan.work_data == nullptr) {
             QNN_LOG_ERROR("malloc failed");
@@ -4650,6 +4620,7 @@ static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t bac
     }

     if (plan.work_data != nullptr) {
+        //TODO: use a memory pool to avoid dynamic memory allocation/free
         free(plan.work_data);
     }
@@ -4766,7 +4737,8 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) {

 /**
  *
  * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP)
- * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android which can got by JNI from Java layer
+ * @param qnn_lib_path the Android app's data directory, such as "/data/data/com.ggml.llamacpp/",
+ *                     which can be obtained through JNI from the Java layer
  *
  * @return
  */
 ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
diff --git a/ggml-qnn.h b/ggml-qnn.h
index 51f02d4ba3078f..6220a8a3fed0d0 100644
--- a/ggml-qnn.h
+++ b/ggml-qnn.h
@@ -1,10 +1,3 @@
-/*
- * MIT license
- * Copyright (C) 2024 GGML Authors
- * SPDX-License-Identifier: MIT
- *
- * this is implementation of ggml QNN(Qualcomm Nerual Network, aka AI Engine Direct) backend
- */
 #pragma once

 #include "ggml.h"
@@ -30,7 +23,8 @@ GGML_API int ggml_backend_qnn_reg_devices();

 /**
  *
  * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP)
- * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android which can got by JNI from Java layer
+ * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/"
+ *                     which can be obtained through JNI from Java layer
  * @return
  */
 GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path);
@@ -45,9 +39,8 @@ GGML_API void ggml_backend_qnn_get_device_description(int device, char

 GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num);

-
-//temporary API, should be removed in the future
-GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+// TODO: this is a temporary API, should be removed in the future
+GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

 #ifdef __cplusplus
diff --git a/ggml.c b/ggml.c
index 919eb0b7b1ff16..086db96af7fcd1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16153,8 +16153,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(

 /////////////////////////////////

-//workaround for Qualcomm QNN backend
-void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);

     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
diff --git a/llama.cpp b/llama.cpp
index a10c3e1fc8488b..451d0aadb51a05 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -15403,7 +15403,7 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_QNN)
     if (model->n_gpu_layers > 0) {
         //the second param is package name of Andorid app, can be got by JNI from Java layer
-        ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/");
+        ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, "/data/data/com.ggml.llamacpp/");
         if (nullptr == backend) {
             LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__);
             llama_free(ctx);
@@ -17577,14 +17577,6 @@ void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }

-static int llama_has_qnn(void) {
-#ifdef GGML_USE_QNN
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 const char * llama_print_system_info(void) {
     static std::string s;
@@ -17606,7 +17598,6 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "QNN = " + std::to_string(llama_has_qnn()) + " | ";

     return s.c_str();
 }